def send_dialog_messages(self, dialog_list):
     tmpLog = self.make_logger(method_name='send_dialog_messages')
     tmpLog.debug('start')
     dataList = []
     for diagSpec in dialog_list:
         dataList.append(diagSpec.convert_to_propagate())
     data = dict()
     data['harvesterID'] = harvester_config.master.harvester_id
     data['dialogs'] = json.dumps(dataList)
     tmpLog.debug('send {0} messages'.format(len(dataList)))
     tmpStat, tmpRes = self.post_ssl('addHarvesterDialogs', data)
     errStr = 'OK'
     if tmpStat is False:
         errStr = core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             retCode, tmpStr = tmpRes.json()
             if not retCode:
                 errStr = core_utils.dump_error_message(tmpLog, tmpStr)
                 tmpStat = False
         except Exception:
             errStr = core_utils.dump_error_message(tmpLog)
             tmpLog.error('conversion failure from {0}'.format(tmpRes.text))
             tmpStat = False
     if tmpStat:
         tmpLog.debug('done with {0}'.format(errStr))
     return tmpStat, errStr
 def update_worker_stats(self, site_name, stats):
     tmpLog = self.make_logger(method_name='update_worker_stats')
     tmpLog.debug('start')
     data = dict()
     data['harvesterID'] = harvester_config.master.harvester_id
     data['siteName'] = site_name
     data['paramsList'] = json.dumps(stats)
     tmpLog.debug('update stats for {0}, stats: {1}'.format(site_name, stats))
     tmpStat, tmpRes = self.post_ssl('reportWorkerStats', data)
     errStr = 'OK'
     if tmpStat is False:
         errStr = core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             retCode, retMsg = tmpRes.json()
             if not retCode:
                 tmpStat = False
                 errStr = core_utils.dump_error_message(tmpLog, retMsg)
         except Exception:
             tmpStat = False
             errStr = core_utils.dump_error_message(tmpLog)
             tmpLog.error('conversion failure from {0}'.format(tmpRes.text))
     if tmpStat:
         tmpLog.debug('done with {0}:{1}'.format(tmpStat, errStr))
     return tmpStat, errStr
Example #3
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     # create Globus Transfer Client
     tmpLog = self.make_logger(_logger, method_name='GoPreparator __init__ ')
     try:
         self.tc = None
         # need to get client_id and refresh_token from PanDA server via harvester cache mechanism
         tmpLog.debug('about to call dbInterface.get_cache(globus_secret)')
         c_data = self.dbInterface.get_cache('globus_secret')
         if c_data is not None and c_data.data['StatusCode'] == 0:
            tmpLog.debug('Got the globus_secrets from PanDA')
            self.client_id = c_data.data['publicKey']  # client_id
            self.refresh_token = c_data.data['privateKey'] # refresh_token
            tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token)
            if not tmpStat:
               self.tc = None
               errStr = 'failed to create Globus Transfer Client'
               tmpLog.error(errStr)
         else :
            self.client_id = None
            self.refresh_token = None
            self.tc = None
            errStr = 'failed to get Globus Client ID and Refresh Token'
            tmpLog.error(errStr)
     except Exception:
         core_utils.dump_error_message(tmpLog)
     tmpLog.debug('__init__ finished')
 def feed_events(self, workspec, events_dict):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='feed_events')
     retVal = True
     if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]:
         # put the json just under the access point
         jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
         tmpLog.debug('feeding events to {0}'.format(jsonFilePath))
         try:
             with open(jsonFilePath, 'w') as jsonFile:
                 json.dump(events_dict, jsonFile)
         except Exception:
             core_utils.dump_error_message(tmpLog)
             retVal = False
     elif workspec.mapType == WorkSpec.MT_MultiJobs:
         # TOBEFIXED
         pass
     # remove request file
     try:
         jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsRequestFileName)
         os.remove(jsonFilePath)
     except Exception:
         pass
     tmpLog.debug('done')
     return retVal
 def is_alive(self, key_values):
     tmpLog = self.make_logger(method_name='is_alive')
     tmpLog.debug('start')
     # convert datetime
     for tmpKey, tmpVal in iteritems(key_values):
         if isinstance(tmpVal, datetime.datetime):
             tmpVal = 'datetime/' + tmpVal.strftime('%Y-%m-%d %H:%M:%S.%f')
             key_values[tmpKey] = tmpVal
     # send data
     data = dict()
     data['harvesterID'] = harvester_config.master.harvester_id
     data['data'] = json.dumps(key_values)
     tmpStat, tmpRes = self.post_ssl('harvesterIsAlive', data)
     retCode = False
     if tmpStat is False:
         tmpStr = core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             retCode, tmpStr = tmpRes.json()
         except Exception:
             tmpStr = core_utils.dump_error_message(tmpLog)
             tmpLog.error('conversion failure from {0}'.format(tmpRes.text))
             tmpStat = False
     if tmpStat:
         tmpLog.debug('done with {0} : {1}'.format(retCode, tmpStr))
     return retCode, tmpStr
 def get_proxy(self, voms_role, cert=None):
     retVal = None
     retMsg = ''
     # get logger
     tmpLog = self.make_logger(method_name='get_proxy')
     tmpLog.debug('start')
     data = {'role': voms_role}
     tmpStat, tmpRes = self.post_ssl('getProxy', data, cert)
     if tmpStat is False:
         core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             tmpDict = tmpRes.json()
             if tmpDict['StatusCode'] == 0:
                 retVal = tmpDict['userProxy']
             else:
                 retMsg = tmpDict['errorDialog']
                 core_utils.dump_error_message(tmpLog, retMsg)
                 tmpStat = False
         except Exception:
             retMsg = core_utils.dump_error_message(tmpLog, tmpRes)
             tmpStat = False
     if tmpStat:
         tmpLog.debug('done with {0}'.format(str(retVal)))
     return retVal, retMsg
 def check_event_availability(self, jobspec):
     retStat = False
     retVal = None
     tmpLog = self.make_logger('PandaID={0}'.format(jobspec.PandaID),
                               method_name='check_event_availability')
     tmpLog.debug('start')
     data = dict()
     data['taskID'] = jobspec.taskID
     data['pandaID'] = jobspec.PandaID
     if jobspec.jobsetID is None:
         data['jobsetID'] = jobspec.jobParams['jobsetID']
     else:
         data['jobsetID'] = jobspec.jobsetID
     tmpStat, tmpRes = self.post_ssl('checkEventsAvailability', data)
     if tmpStat is False:
         core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             tmpDict = tmpRes.json()
             if tmpDict['StatusCode'] == 0:
                 retStat = True
                 retVal = tmpDict['nEventRanges']
         except Exception:
             core_utils.dump_error_message(tmpLog, tmpRes)
     tmpLog.debug('done with {0}'.format(retVal))
     return retStat, retVal
 def update_workers(self, workspec_list):
     tmpLog = self.make_logger(method_name='update_workers')
     tmpLog.debug('start')
     dataList = []
     for workSpec in workspec_list:
         dataList.append(workSpec.convert_to_propagate())
     data = dict()
     data['harvesterID'] = harvester_config.master.harvester_id
     data['workers'] = json.dumps(dataList)
     tmpLog.debug('update {0} workers'.format(len(dataList)))
     tmpStat, tmpRes = self.post_ssl('updateWorkers', data)
     retList = None
     errStr = 'OK'
     if tmpStat is False:
         errStr = core_utils.dump_error_message(tmpLog, tmpRes)
     else:
         try:
             retCode, retList = tmpRes.json()
             if not retCode:
                 errStr = core_utils.dump_error_message(tmpLog, retList)
                 retList = None
                 tmpStat = False
         except Exception:
             errStr = core_utils.dump_error_message(tmpLog)
             tmpLog.error('conversion failure from {0}'.format(tmpRes.text))
             tmpStat = False
     if tmpStat:
         tmpLog.debug('done with {0}'.format(errStr))
     return retList, errStr
Example #9
def rucio_create_dataset(tmpLog,datasetScope,datasetName):
    # create the dataset
    try:
        # register dataset
        lifetime = 7*24*60*60
        tmpLog.debug('register {0}:{1} lifetime = {2}'
                     .format(datasetScope, datasetName,lifetime))
        executable = ['/usr/bin/env',
                      'rucio', 'add-dataset']
        executable += ['--lifetime', ('%d' % lifetime)]
        executable += [datasetName]

        #print executable

        tmpLog.debug('rucio add-dataset command: {0} '.format(executable))
        tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable))

        process = subprocess.Popen(executable,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

        stdout,stderr = process.communicate()

        if process.returncode == 0:
            tmpLog.debug(stdout)
            return True,''
        else:
            # check what failed
            dataset_exists = False
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'Data Identifier Already Exists' in line:
                    dataset_exists = True
                    break
                elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break
            if dataset_exists:
                errMsg = 'dataset {0}:{1} already exists'.format(datasetScope,
                                                                 datasetName)
                tmpLog.debug(errMsg)
                return True,errMsg
            elif rucio_sessions_limit_error:
                # do nothing
                errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None,errStr
            else:
                # some other Rucio error
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False,errStr
    except Exception:
        errMsg = 'Could not create dataset {0}:{1}'.format(datasetScope,
                                                           datasetName)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False,errMsg
 def feed_jobs(self, workspec, jobspec_list):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='feed_jobs')
     retVal = True
     # get PFC
     pfc = core_utils.make_pool_file_catalog(jobspec_list)
     pandaIDs = []
     for jobSpec in jobspec_list:
         accessPoint = self.get_access_point(workspec, jobSpec.PandaID)
         jobSpecFilePath = os.path.join(accessPoint, jobSpecFileName)
         xmlFilePath = os.path.join(accessPoint, xmlPoolCatalogFileName)
         tmpLog.debug('feeding jobs to {0}'.format(jobSpecFilePath))
         try:
             # put job spec file
             with open(jobSpecFilePath, 'w') as jobSpecFile:
                 jobParams = jobSpec.get_job_params(self.stripJobParams)
                 if self.jobSpecFileFormat == 'cgi':
                     jobSpecFile.write(urlencode(jobParams))
                 else:
                     json.dump({jobSpec.PandaID: jobParams}, jobSpecFile)
             # put PFC.xml
             with open(xmlFilePath, 'w') as pfcFile:
                 pfcFile.write(pfc)
             # make symlink
             inFiles = jobSpec.get_input_file_attributes()
             for inLFN, inFile in iteritems(inFiles):
                 dstPath = os.path.join(accessPoint, inLFN)
                 if 'path' in inFile and inFile['path'] != dstPath:
                     # test if symlink exists if so remove it
                     if os.path.exists(dstPath):
                         os.unlink(dstPath)
                         tmpLog.debug("removing existing symlink %s" % dstPath)
                     os.symlink(inFile['path'], dstPath)
             pandaIDs.append(jobSpec.PandaID)
         except Exception:
             core_utils.dump_error_message(tmpLog)
             retVal = False
     # put PandaIDs file
     try:
         jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile)
         with open(jsonFilePath, 'w') as jsonPandaIDsFile:
             json.dump(pandaIDs, jsonPandaIDsFile)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         retVal = False
     # remove request file
     try:
         reqFilePath = os.path.join(workspec.get_access_point(), jsonJobRequestFileName)
         os.remove(reqFilePath)
     except Exception:
         pass
     tmpLog.debug('done')
     return retVal
Example #11
 def is_alive(self, workspec, worker_heartbeat_limit):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='is_alive')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.is_alive(self.original_config, workspec, worker_heartbeat_limit)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Example #12
 def check_workers(self, workspec_list):
     tmpLog = core_utils.make_logger(_logger, method_name='check_workers')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.check_workers(self.original_config, workspec_list)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Example #13
 def post_processing(self, workspec, jobspec_list, map_type):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='post_processing')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.post_processing(self.original_config, workspec, jobspec_list, map_type)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Example #14
 def acknowledge_events_files(self, workspec):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='acknowledge_events_files')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.acknowledge_events_files(self.original_config, workspec)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Example #15
 def kill_requested(self, workspec):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='kill_requested')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.kill_requested(self.original_config, workspec)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Example #16
def rucio_add_files_to_dataset(tmpLog,datasetScope,datasetName,fileList):
    # add files to dataset 
    try:
        #create the to DID
        to_did = '{0}:{1}'.format(datasetScope,datasetName)
        executable = ['/usr/bin/env',
                      'rucio', 'attach', to_did]
        # loop over the files to add
        for filename in fileList:
            from_did = '{0}:{1}'.format(filename['scope'],filename['name'])
            executable += [from_did]

        #print executable 
        tmpLog.debug('rucio attach command: {0} '.format(executable))
        tmpLog.debug('rucio attach command (for human): %s ' % ' '.join(executable))

        process = subprocess.Popen(executable,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

        stdout,stderr = process.communicate()
                
        if process.returncode == 0:
            tmpLog.debug(stdout)
            return True,''
        else:
            # check what failed
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break
            if rucio_sessions_limit_error:
                # do nothing
                errStr = 'Rucio returned Sessions Limit error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None,errStr
            else:
                # some other Rucio error 
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False,errStr
            #except FileAlreadyExists:
            #    # ignore if files already exist
            #    pass
    except Exception:
        errMsg = 'Could not add files to DS - {0}:{1} files - {2}'.format(datasetScope,
                                                                          datasetName,
                                                                          fileList)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False,errMsg
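For illustration only, a minimal usage sketch of the helper above; it is not part of the harvester source. The logger, scope, dataset name and file DIDs are hypothetical; the sketch only shows the expected shape of fileList (dicts with 'scope' and 'name' keys) and the True/None/False return convention.

import logging

log = logging.getLogger('rucio_attach_example')  # stand-in for the harvester logger
file_list = [{'scope': 'user.someuser', 'name': 'file_001.root'},  # hypothetical DIDs
             {'scope': 'user.someuser', 'name': 'file_002.root'}]
ok, msg = rucio_add_files_to_dataset(log, 'user.someuser', 'example.dataset', file_list)
if ok is None:
    log.warning('transient Rucio error, retry later: {0}'.format(msg))
elif not ok:
    log.error('attach failed: {0}'.format(msg))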
Example #17
    def feed_events(self, workspec, events_dict):
        '''Harvester has an event range to pass to job'''

        # get logger
        arclog = arc_utils.ARCLogger(baselogger, workspec.workerID)
        tmpLog = arclog.log

        # Upload to jobid/jsonEventsFeedFileName, delete jobid/jsonEventsRequestFileName
        job = workspec.workAttributes['arcjob']
        arcid = job['JobID']
        # Set certificate to use for interacting with ARC CE
        usercfg = arc.UserConfig(self.cred_type)
        if not self._setup_proxy(usercfg, workspec, arcid, tmpLog):
            return False

        retVal = True
        if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]:
            # put the json just under the access point then upload to ARC CE
            localJsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
            tmpLog.debug('feeding events to {0}'.format(localJsonFilePath))
            try:
                with open(localJsonFilePath, 'w') as jsonFile:
                    json.dump(events_dict, jsonFile)
            except Exception:
                core_utils.dump_error_message(tmpLog)
                retVal = False

            remoteJsonFilePath = '%s/%s' % (arcid, jsonEventsFeedFileName)
            # Try to copy the file
            status = self._copy_file(localJsonFilePath, remoteJsonFilePath, usercfg, tmpLog)
            if not status:
                tmpLog.error('Failed to feed events to {0}: {1}'.format(remoteJsonFilePath, str(status)))
                retVal = False
            else:
                remoteJsonEventsRequestFile = '%s/%s' % (arcid, jsonEventsRequestFileName)
                status = self._delete_file(remoteJsonEventsRequestFile, usercfg, tmpLog)
                if not status and status.GetErrno() != errno.ENOENT:
                    tmpLog.error('Failed to delete event request file at {0}'.format(remoteJsonEventsRequestFile))

        elif workspec.mapType == WorkSpec.MT_MultiJobs:
            # TOBEFIXED
            pass
        # remove request file
        try:
            jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
            os.remove(jsonFilePath)
        except Exception:
            pass
        tmpLog.debug('done')
        return retVal
Example #18
def rucio_rule_info(tmpLog,rucioRule):
    # get rule-info
    tmpLog.debug('rucio rule-info {0}'.format(rucioRule))
    try:                
        executable = ['/usr/bin/env',
                      'rucio', 'rule-info',rucioRule]
        #print executable 

        tmpLog.debug('rucio rule-info command: {0} '.format(executable))
        tmpLog.debug('rucio rule-info command (for human): %s ' % ' '.join(executable))

        process = subprocess.Popen(executable,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

        stdout,stderr = process.communicate()
                
        if process.returncode == 0:
            tmpLog.debug(stdout)
            # parse the output to get the state:
            for line in stdout.split('\n'):
                if 'State:' in line:
                    # get the State variable
                    result = line.split()
                    return True,result
            return None,''
        else:
            # check what failed
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break

            if rucio_sessions_limit_error:
                # do nothing
                errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None,errStr
            else:
                # some other Rucio error 
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False,errStr
    except Exception:
        errMsg = 'Could not run rucio rule-info {0}'.format(rucioRule)
        core_utils.dump_error_message(tmpLog)
        tmpLog.error(errMsg)
        return False,errMsg
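A minimal sketch (illustrative, not from the harvester source) of how the whitespace-split 'State:' tokens returned above might be consumed. It assumes a standard logging logger in place of the harvester logger, a hypothetical rule ID, and that rucio rule-info prints a line such as 'State: OK'.

import logging

log = logging.getLogger('rucio_rule_example')  # stand-in for the harvester logger
rule_id = 'abcdef0123456789abcdef0123456789'   # hypothetical rule ID
tmpStat, result = rucio_rule_info(log, rule_id)
if tmpStat:
    # result is the split 'State:' line, e.g. ['State:', 'OK']
    rule_state = result[1] if len(result) > 1 else 'UNKNOWN'
    log.info('rule {0} is in state {1}'.format(rule_id, rule_state))
elif tmpStat is None:
    log.warning('no State line yet or transient Rucio error, retry later')
else:
    log.error('rule-info failed: {0}'.format(result))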
Example #19
 def catch_sigkill(sig, frame):
     disable_profiler()
     _logger.info('got signal={0} to be killed'.format(sig))
     try:
         os.remove(options.pid)
     except Exception:
         pass
     try:
         if os.getppid() == 1:
             os.killpg(os.getpgrp(), signal.SIGKILL)
         else:
             os.kill(os.getpid(), signal.SIGKILL)
     except Exception:
         core_utils.dump_error_message(_logger)
         _logger.error('failed to be killed')
Example #20
def rucio_add_rule(tmpLog,datasetScope,datasetName,dstRSE):
    # add rule
    try:
        tmpLog.debug('rucio add-rule {0}:{1} 1 {2}'.format(datasetScope, datasetName,
                                                           dstRSE))
        did = '{0}:{1}'.format(datasetScope,datasetName)
        executable = ['/usr/bin/env',
                      'rucio', 'add-rule',did,'1',dstRSE]

        #print executable 

        tmpLog.debug('rucio add-rule command: {0} '.format(executable))
        tmpLog.debug('rucio add-rule command (for human): %s ' % ' '.join(executable))

        process = subprocess.Popen(executable,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

        stdout,stderr = process.communicate()
                
        if process.returncode == 0:
            tmpLog.debug(stdout)
            #parse stdout for rule id
            rule_id = stdout.split('\n')[0]
            return True,rule_id
        else:
            # check what failed
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
                    break
            if rucio_sessions_limit_error:
                # do nothing
                errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout)
                tmpLog.warning(errStr)
                return None,errStr
            else:
                # some other Rucio error 
                errStr = 'Rucio returned error : stdout: {0}'.format(stdout)
                tmpLog.error(errStr)
                return False,errStr
    except Exception:
        core_utils.dump_error_message(tmpLog)
        # treat as a temporary error
        tmpStat = False
        tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName)
        return tmpStat,tmpMsg
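To show how the rucio_* helpers in this listing fit together, here is a hypothetical end-to-end sketch: create the dataset, attach files, then add a replication rule, honouring the True/None/False convention (None means a transient error worth retrying). The logger, scope, dataset, files and destination RSE are all made-up values, not taken from the harvester source.

import logging

log = logging.getLogger('rucio_chain_example')       # stand-in for the harvester logger
scope = 'user.someuser'                               # hypothetical scope
dataset = 'example.output.dataset'                    # hypothetical dataset name
files = [{'scope': scope, 'name': 'file_001.root'}]   # hypothetical file DIDs
dst_rse = 'EXAMPLE_DATADISK'                          # hypothetical destination RSE

created, msg = rucio_create_dataset(log, scope, dataset)
if created:  # True also covers the 'already exists' case
    attached, msg = rucio_add_files_to_dataset(log, scope, dataset, files)
    if attached:
        ruled, rule_or_err = rucio_add_rule(log, scope, dataset, dst_rse)
        if ruled:
            log.info('replication rule id: {0}'.format(rule_or_err))
elif created is None:
    log.warning('transient Rucio error, will retry later: {0}'.format(msg))
else:
    log.error('dataset creation failed: {0}'.format(msg))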
Example #21
def application(environ, start_response):
    try:
        # get params
        try:
            request_body_size = int(environ.get('CONTENT_LENGTH', 0))
        except Exception:
            request_body_size = 0
        # check token
        try:
            auth_str = environ.get('HTTP_AUTHORIZATION', '').split()[-1]
            token = HarvesterToken()
            payload = token.get_payload(auth_str)
        except Exception:
            errMsg = 'Auth failed: Invalid token'
            start_response('403 Forbidden', [('Content-Type', 'text/plain')])
            return [errMsg.encode('ascii')]
        request_body = environ['wsgi.input'].read(request_body_size)
        params = json.loads(request_body)
        # make handler
        handler = ApacheHandler(None, None, None)
        handler.set_form(params)
        # execute
        handler.do_POST()
        # make response
        _logger.debug("{0} Phrase".format(handler.responseCode))
        start_response("{0} Phrase".format(handler.responseCode), handler.headerList)
        return [handler.message]
    except Exception:
        errMsg = core_utils.dump_error_message(_logger)
        start_response('500 Phrase', [('Content-Type', 'text/plain')])
        return [errMsg.encode('ascii')]
Example #22
 def zip_output(self, jobspec):
     # make logger
     tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
                               method_name='zip_output')
     tmpLog.debug('start')
     try:
         for fileSpec in jobspec.outFiles:
             if self.zipDir == "${SRCDIR}":
                 # the same directory as src
                 zipDir = os.path.dirname(next(iter(fileSpec.associatedFiles)).path)
             else:
                 zipDir = self.zipDir
             zipPath = os.path.join(zipDir, fileSpec.lfn)
             # remove zip file just in case
             try:
                 os.remove(zipPath)
             except Exception:
                 pass
             # make zip file
             with zipfile.ZipFile(zipPath, "w", zipfile.ZIP_STORED) as zf:
                 for assFileSpec in fileSpec.associatedFiles:
                     zf.write(assFileSpec.path,os.path.basename(assFileSpec.path))
             # set path
             fileSpec.path = zipPath
             # get size
             statInfo = os.stat(zipPath)
             fileSpec.fsize = statInfo.st_size
     except Exception:
         errMsg = core_utils.dump_error_message(tmpLog)
         return False, 'failed to zip with {0}'.format(errMsg)
     tmpLog.debug('done')
     return True, ''
Example #23
 def check_credential(self):
     # make logger
     mainLog = self.make_logger(_logger, method_name='check_credential')
     comStr = "grid-proxy-info -exists -hours 72 -file {0}".format(self.outCertFile)
     mainLog.debug(comStr)
     try:
         p = subprocess.Popen(comStr.split(),
                              shell=False,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
         stdOut, stdErr = p.communicate()
         retCode = p.returncode
     except Exception:
         core_utils.dump_error_message(mainLog)
         return False
     mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr))
     return retCode == 0
Example #24
 def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None):
     tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type),
                                     method_name='make_workers')
     tmpLog.debug('start')
     try:
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
         if maker is None:
             # not found
             tmpLog.error('plugin for {0} not found'.format(queue_config.queueName))
             return [], jobchunk_list
         # get ready workers
         readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready)
         # loop over all chunks
         okChunks = []
         ngChunks = []
         for iChunk, jobChunk in enumerate(jobchunk_list):
             # make a worker
             if iChunk >= n_ready:
                 workSpec = maker.make_worker(jobChunk, queue_config, resource_type)
             else:
                 # use ready worker
                 if iChunk < len(readyWorkers):
                     workSpec = readyWorkers[iChunk]
                 else:
                     workSpec = None
             # failed
             if workSpec is None:
                 ngChunks.append(jobChunk)
                 continue
             # set workerID
             if workSpec.workerID is None:
                 workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID')
                 workSpec.configID = queue_config.configID
                 workSpec.isNew = True
             okChunks.append((workSpec, jobChunk))
         # dump
         tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks),
                                                                        len(ngChunks)))
         return okChunks, ngChunks
     except Exception:
         # dump error
         core_utils.dump_error_message(tmpLog)
         return [], jobchunk_list
Example #25
    def get_job_stats(self):
        tmp_log = self.make_logger(method_name='get_job_stats')
        tmp_log.debug('start')

        tmp_stat, tmp_res = self.post_ssl('getJobStatisticsPerSite', {})
        stats = {}
        if tmp_stat is False:
            ret_msg = 'FAILED'
            core_utils.dump_error_message(tmp_log, tmp_res)
        else:
            try:
                stats = pickle.loads(tmp_res.content)
                ret_msg = 'OK'
            except Exception:
                ret_msg = 'Exception'
                core_utils.dump_error_message(tmp_log)

        return stats, ret_msg
Example #26
 def get_event_ranges(self, data_map, scattered):
     retStat = False
     retVal = dict()
     try:
         getEventsChunkSize = harvester_config.pandacon.getEventsChunkSize
     except Exception:
         getEventsChunkSize = 5120
     for pandaID, data in iteritems(data_map):
         # get logger
         tmpLog = self.make_logger('PandaID={0}'.format(data['pandaID']),
                                   method_name='get_event_ranges')
         if 'nRanges' in data:
             nRanges = data['nRanges']
         else:
             nRanges = 1
         if scattered:
             data['scattered'] = True
         tmpLog.debug('start nRanges={0}'.format(nRanges))
         while nRanges > 0:
             # use a small chunk size to avoid timeout
             chunkSize = min(getEventsChunkSize, nRanges)
             data['nRanges'] = chunkSize
             tmpStat, tmpRes = self.post_ssl('getEventRanges', data)
             if tmpStat is False:
                 core_utils.dump_error_message(tmpLog, tmpRes)
             else:
                 try:
                     tmpDict = tmpRes.json()
                     if tmpDict['StatusCode'] == 0:
                         retStat = True
                         if data['pandaID'] not in retVal:
                             retVal[data['pandaID']] = []
                         retVal[data['pandaID']] += tmpDict['eventRanges']
                         # got empty
                         if len(tmpDict['eventRanges']) == 0:
                             break
                 except Exception:
                     core_utils.dump_error_message(tmpLog, tmpRes)
                     break
             nRanges -= chunkSize
         tmpLog.debug('done with {0}'.format(str(retVal)))
     return retStat, retVal
Example #27
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     # make logger
     tmpLog = self.make_logger(_logger, 'ThreadID={0}'.format(threading.current_thread().ident),
                               method_name='GlobusBulkPreparator __init__')
     tmpLog.debug('__init__ start')
     self.thread_id = threading.current_thread().ident
     self.id = GlobusBulkPreparator.next_id
     GlobusBulkPreparator.next_id += 1
     with uLock:
         global uID
         self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX')
         uID += 1
         uID %= harvester_config.preparator.nThreads
     # create Globus Transfer Client
     try:
         self.tc = None
         # need to get client_id and refresh_token from PanDA server via harvester cache mechanism
         tmpLog.debug('about to call dbInterface.get_cache(globus_secret)')
         c_data = self.dbInterface.get_cache('globus_secret')
         if c_data is not None and c_data.data['StatusCode'] == 0:
             tmpLog.debug('Got the globus_secrets from PanDA')
             self.client_id = c_data.data['publicKey']  # client_id
             self.refresh_token = c_data.data['privateKey'] # refresh_token
             tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token)
             if not tmpStat:
                 self.tc = None
                 errStr = 'failed to create Globus Transfer Client'
                 tmpLog.error(errStr)
         else :
             self.client_id = None
             self.refresh_token = None
             self.tc = None
             errStr = 'failed to get Globus Client ID and Refresh Token'
             tmpLog.error(errStr)
     except Exception:
         core_utils.dump_error_message(tmpLog)
     # tmp debugging
     tmpLog.debug('self.id = {0}'.format(self.id))
     tmpLog.debug('self.dummy_transfer_id = {0}'.format(self.dummy_transfer_id))
     # tmp debugging
     tmpLog.debug('__init__ finish')
Example #28
 def submit_with_command(self, jdl_list, use_spool=False, tmp_str='', keep_temp_sdf=False):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_command')
     # Initialize
     errStr = ''
     batchIDs_list = []
     # make sdf temp file from jdls
     tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=(not keep_temp_sdf),
                                 suffix='_{0}_cluster_submit.sdf'.format(tmp_str))
     sdf_file = tmpFile.name
     tmpFile.write('\n\n'.join(jdl_list))
     tmpFile.flush()
     # make condor remote options
     name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else ''
     pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else ''
     spool_opt = '-remote -spool' if use_spool and self.condor_schedd else ''
     # command
     comStr = 'condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
                 sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt)
     # submit
     tmpLog.debug('submit with command: {0}'.format(comStr))
     try:
         p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True,
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         # check return code
         stdOut, stdErr = p.communicate()
         retCode = p.returncode
     except Exception as e:
         stdOut = ''
         stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
         retCode = 1
         errStr = '{0}: {1}'.format(e.__class__.__name__, e)
     finally:
         tmpFile.close()
     tmpLog.debug('retCode={0}'.format(retCode))
     if retCode == 0:
         # extract clusterid and n_jobs
         job_id_match = None
         for tmp_line_str in stdOut.split('\n'):
             job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str)
             if job_id_match:
                 break
         if job_id_match is not None:
             n_jobs = int(job_id_match.group(1))
             clusterid = job_id_match.group(2)
             batchIDs_list = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)]
             tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
         else:
             errStr = 'no job submitted: {0}'.format(errStr)
             tmpLog.error(errStr)
     else:
         tmpLog.error('submission failed: {0} ; {1}'.format(stdErr, errStr))
     # Return
     return (batchIDs_list, errStr)
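As a quick illustration of the parsing step above (not from the harvester source): the regular expression expects the standard condor_submit summary line, so a made-up output line maps to batch IDs as follows.

import re

sample_line = '3 job(s) submitted to cluster 1234.'  # hypothetical condor_submit output
m = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', sample_line)
if m:
    n_jobs, clusterid = int(m.group(1)), m.group(2)
    batch_ids = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)]
    # batch_ids -> ['1234.0', '1234.1', '1234.2']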
Example #29
 def ack_commands(self, command_ids):
     harvester_id = harvester_config.master.harvester_id
     tmpLog = self.make_logger('harvesterID={0}'.format(harvester_id),
                               method_name='ack_commands')
     tmpLog.debug('Start acknowledging {0} commands (command_ids={1})'.format(len(command_ids), command_ids))
     data = {}
     data['command_ids'] = json.dumps(command_ids)
     tmp_stat, tmp_res = self.post_ssl('ackCommands', data)
     if tmp_stat is False:
         core_utils.dump_error_message(tmpLog, tmp_res)
     else:
         try:
             tmp_dict = tmp_res.json()
             if tmp_dict['StatusCode'] == 0:
                 tmpLog.debug('Finished acknowledging commands')
                 return True
             return False
         except KeyError:
             core_utils.dump_error_message(tmpLog, tmp_res)
     return False
 def setup_access_points(self, workspec_list):
     try:
         for workSpec in workspec_list:
             accessPoint = workSpec.get_access_point()
             # make the dir if missing
             if not os.path.exists(accessPoint):
                 os.makedirs(accessPoint)
             jobSpecs = workSpec.get_jobspec_list()
             if jobSpecs is not None:
                 for jobSpec in jobSpecs:
                     subAccessPoint = self.get_access_point(workSpec, jobSpec.PandaID)
                     if accessPoint != subAccessPoint:
                         if not os.path.exists(subAccessPoint):
                             os.mkdir(subAccessPoint)
         return True
     except Exception:
         # get logger
         tmpLog = core_utils.make_logger(_logger, method_name='setup_access_points')
         core_utils.dump_error_message(tmpLog)
         return False
Example #31
 def execute(self):
     # avoid too early check
     if not self.singleMode and datetime.datetime.utcnow() - self.startTime \
             < datetime.timedelta(seconds=harvester_config.watcher.checkInterval):
         return
     mainLog = core_utils.make_logger(_logger,
                                      'id={0}'.format(self.get_pid()),
                                      method_name='execute')
     mainLog.debug('start')
     # get file lock
     try:
         with core_utils.get_file_lock(
                 lockFileName, harvester_config.watcher.checkInterval):
             logFileName = os.path.join(logDir, 'panda-db_proxy.log')
             timeNow = datetime.datetime.utcnow()
             if os.path.exists(logFileName):
                 # get latest timestamp
                 try:
                     p = subprocess.Popen(['tail', '-1', logFileName],
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
                     line = p.stdout.readline()
                     lastTime = datetime.datetime.strptime(
                         line[:23], "%Y-%m-%d %H:%M:%S,%f")
                 except Exception:
                     lastTime = None
                 # get processing time for last 1000 queries
                 logDuration = None
                 try:
                     p = subprocess.Popen('tail -{0} {1} | head -1'.format(
                         harvester_config.watcher.nMessages, logFileName),
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          shell=True)
                     line = p.stdout.readline()
                     firstTime = datetime.datetime.strptime(
                         line[:23], "%Y-%m-%d %H:%M:%S,%f")
                     if lastTime is not None:
                         logDuration = lastTime - firstTime
                 except Exception:
                     pass
                 tmpMsg = 'last log message at {0}. '.format(lastTime)
                 if logDuration is not None:
                     tmpMsg += '{0} messages took {1} sec'.format(
                         harvester_config.watcher.nMessages,
                         logDuration.total_seconds())
                 mainLog.debug(tmpMsg)
                 # check timestamp
                 doAction = False
                 if harvester_config.watcher.maxStalled > 0 and lastTime is not None and \
                         timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled):
                     mainLog.warning(
                         'last log message is too old. seems to be stalled')
                     doAction = True
                 elif harvester_config.watcher.maxDuration > 0 and logDuration is not None and \
                         logDuration.total_seconds() > harvester_config.watcher.maxDuration:
                     mainLog.warning(
                         'slow message generation. seems to be a performance issue'
                     )
                     doAction = True
                 # take action
                 if doAction:
                     # email
                     if 'email' in harvester_config.watcher.actions.split(
                             ','):
                         # get pass phrase
                         toSkip = False
                         mailUser = None
                         mailPass = None
                         if harvester_config.watcher.mailUser != '' and \
                                 harvester_config.watcher.mailPassword != '':
                             envName = harvester_config.watcher.passphraseEnv
                             if envName not in os.environ:
                                 tmpMsg = '{0} is undefined in etc/sysconfig/panda_harvester'.format(
                                     envName)
                                 mainLog.error(tmpMsg)
                                 toSkip = True
                             else:
                                 key = os.environ[envName]
                                 mailUser = core_utils.decrypt_string(
                                     key, harvester_config.watcher.mailUser)
                                 mailPass = core_utils.decrypt_string(
                                     key,
                                     harvester_config.watcher.mailPassword)
                         if not toSkip:
                             # message
                             msgBody = 'harvester {0} '.format(
                                 harvester_config.master.harvester_id)
                             msgBody += 'is having a problem on {0} '.format(
                                 socket.getfqdn())
                             msgBody += 'at {0} (UTC)'.format(
                                 datetime.datetime.utcnow())
                             message = MIMEText(msgBody)
                             message['Subject'] = "Harvester Alarm"
                             message[
                                 'From'] = harvester_config.watcher.mailFrom
                             message['To'] = harvester_config.watcher.mailTo
                             # send email
                             mainLog.debug('sending email to {0}'.format(
                                 harvester_config.watcher.mailTo))
                             server = smtplib.SMTP(
                                 harvester_config.watcher.mailServer,
                                 harvester_config.watcher.mailPort)
                             if hasattr(harvester_config.watcher, 'mailUseSSL') and \
                                     harvester_config.watcher.mailUseSSL is True:
                                 server.starttls()
                             if mailUser is not None and mailPass is not None:
                                 server.login(mailUser, mailPass)
                             server.ehlo()
                             server.sendmail(
                                 harvester_config.watcher.mailFrom,
                                 harvester_config.watcher.mailTo.split(','),
                                 message.as_string())
                             server.quit()
                     # kill
                     if 'kill' in harvester_config.watcher.actions.split(
                             ','):
                          # send USR2 first
                         mainLog.debug('sending SIGUSR2')
                         os.killpg(os.getpgrp(), signal.SIGUSR2)
                         time.sleep(60)
                         mainLog.debug('sending SIGKILL')
                         os.killpg(os.getpgrp(), signal.SIGKILL)
             else:
                 mainLog.debug('skip as {0} is missing'.format(logFileName))
     except IOError:
         mainLog.debug(
             'skip as locked by another thread or too early to check')
     except Exception:
         core_utils.dump_error_message(mainLog)
     mainLog.debug('done')
Example #32
 def run(self):
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         sw_main = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         # get commands to kill
         sw_getcomm = core_utils.get_stopwatch()
         mainLog.debug('try to get commands')
         comStr = CommandSpec.COM_killWorkers
         commandSpecs = self.dbProxy.get_commands_for_receiver(
             'sweeper', comStr)
         mainLog.debug('got {0} {1} commands'.format(
             len(commandSpecs), comStr))
         for commandSpec in commandSpecs:
             n_to_kill = self.dbProxy.kill_workers_by_query(
                 commandSpec.params)
             mainLog.debug('will kill {0} workers with {1}'.format(
                 n_to_kill, commandSpec.params))
         mainLog.debug('done handling commands' +
                       sw_getcomm.get_elapsed_time())
         # killing stage
         sw_kill = core_utils.get_stopwatch()
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(
             harvester_config.sweeper.maxWorkers,
             harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(
             len(workersToKill)))
         # loop over all workers
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersToKill):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 try:
                     sweeperCore = self.pluginFactory.get_plugin(
                         queueConfig.sweeper)
                 except Exception:
                     mainLog.error(
                         'failed to launch sweeper plugin for {0}/{1}'.
                         format(queueName, configID))
                     core_utils.dump_error_message(mainLog)
                     continue
                 sw.reset()
                 n_workers = len(workspec_list)
                 try:
                     # try bulk method
                     tmpLog = self.make_logger(_logger,
                                               'id={0}'.format(lockedBy),
                                               method_name='run')
                     tmpLog.debug('start killing')
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpLog.debug('start killing one worker')
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                             tmpLog.debug(
                                 'done killing with status={0} diag={1}'.
                                 format(tmpStat, tmpOut))
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 else:
                     # bulk method
                     n_killed = 0
                     for workspec, (tmpStat,
                                    tmpOut) in zip(workspec_list, tmpList):
                         tmpLog.debug(
                             'done killing workerID={0} with status={1} diag={2}'
                             .format(workspec.workerID, tmpStat, tmpOut))
                         if tmpStat:
                             n_killed += 1
                     tmpLog.debug('killed {0}/{1} workers'.format(
                         n_killed, n_workers))
                 mainLog.debug(
                     'done killing {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
         # cleanup stage
         sw_cleanup = core_utils.get_stopwatch()
         # timeout for missed
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         try:
             keepPending = harvester_config.sweeper.keepPending
         except Exception:
             keepPending = 24
         # get workers for cleanup
         statusTimeoutMap = {
             'finished': harvester_config.sweeper.keepFinished,
             'failed': harvester_config.sweeper.keepFailed,
             'cancelled': harvester_config.sweeper.keepCancelled,
             'missed': keepMissed,
             'pending': keepPending
         }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(
             harvester_config.sweeper.maxWorkers, statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(
             len(workersForCleanup)))
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(
                 workersForCleanup):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(
                     queueConfig.sweeper)
                 messenger = self.pluginFactory.get_plugin(
                     queueConfig.messenger)
                 sw.reset()
                 n_workers = len(workspec_list)
                 # make sure workers to clean up are all terminated
                 mainLog.debug(
                     'making sure workers to clean up are all terminated')
                 try:
                     # try bulk method
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 mainLog.debug(
                     'made sure workers to clean up are all terminated')
                 # start cleanup
                 for workspec in workspec_list:
                     tmpLog = self.make_logger(_logger,
                                               'workerID={0}'.format(
                                                   workspec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleaning up one worker')
                         # sweep worker
                         tmpStat, tmpOut = sweeperCore.sweep_worker(
                             workspec)
                         tmpLog.debug(
                             'swept_worker with status={0} diag={1}'.format(
                                 tmpStat, tmpOut))
                         tmpLog.debug('start messenger cleanup')
                         mc_tmpStat, mc_tmpOut = messenger.clean_up(
                             workspec)
                         tmpLog.debug(
                             'messenger cleaned up with status={0} diag={1}'
                             .format(mc_tmpStat, mc_tmpOut))
                         if tmpStat:
                             self.dbProxy.delete_worker(workspec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 mainLog.debug(
                     'done cleaning up {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
         # old-job-deletion stage
         sw_delete = core_utils.get_stopwatch()
         mainLog.debug('delete old jobs')
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         # delete orphaned job info
         self.dbProxy.delete_orphaned_job_info()
         mainLog.debug('done deletion of old jobs' +
                       sw_delete.get_elapsed_time())
         # disk cleanup
         if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                 hasattr(harvester_config.sweeper, 'diskHighWatermark'):
             locked = self.dbProxy.get_process_lock(
                 'sweeper', self.get_pid(),
                 harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
             if locked:
                 try:
                     all_active_files = None
                     for item in harvester_config.sweeper.diskHighWatermark.split(
                             ','):
                         # dir name and watermark in GB
                         dir_name, watermark = item.split('|')
                         mainLog.debug(
                             'checking {0} for cleanup with watermark {1} GB'
                             .format(dir_name, watermark))
                         watermark = int(watermark) * 10**9
                         total_size = 0
                         file_dict = {}
                         # scan dir
                         for root, dirs, filenames in walk(dir_name):
                             for base_name in filenames:
                                 full_name = os.path.join(root, base_name)
                                 f_size = os.path.getsize(full_name)
                                 total_size += f_size
                                 mtime = os.path.getmtime(full_name)
                                 file_dict.setdefault(mtime, set())
                                 file_dict[mtime].add(
                                     (base_name, full_name, f_size))
                         # delete if necessary
                         if total_size < watermark:
                             mainLog.debug(
                                 'skip cleanup {0} due to total_size {1} GB < watermark {2} GB'
                                 .format(dir_name, total_size // (10**9),
                                         watermark // (10**9)))
                         else:
                             mainLog.debug(
                                 'cleanup {0} due to total_size {1} GB >= watermark {2} GB'
                                 .format(dir_name, total_size // (10**9),
                                         watermark // (10**9)))
                             # get active input files
                             if all_active_files is None:
                                 all_active_files = self.dbProxy.get_all_active_input_files(
                                 )
                             deleted_size = 0
                             mtimes = sorted(file_dict.keys())
                             for mtime in mtimes:
                                 for base_name, full_name, f_size in file_dict[
                                         mtime]:
                                     # keep if active
                                     if base_name in all_active_files:
                                         continue
                                     try:
                                         os.remove(full_name)
                                     except Exception:
                                         core_utils.dump_error_message(
                                             mainLog)
                                     deleted_size += f_size
                                     if total_size - deleted_size < watermark:
                                         break
                                 if total_size - deleted_size < watermark:
                                     break
                 except Exception:
                     core_utils.dump_error_message(mainLog)
         # time the cycle
         mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
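
The disk-cleanup stage above removes the oldest files that are not active inputs until the directory drops back under the configured watermark. A minimal standalone sketch of that idea, assuming nothing beyond the standard library; clean_dir() and its keep argument are illustrative names, not part of harvester:

import os

def clean_dir(dir_name, watermark_bytes, keep=frozenset()):
    # collect every file with its size and modification time
    entries = []
    total_size = 0
    for root, _dirs, filenames in os.walk(dir_name):
        for base_name in filenames:
            full_name = os.path.join(root, base_name)
            try:
                f_size = os.path.getsize(full_name)
                mtime = os.path.getmtime(full_name)
            except OSError:
                continue
            total_size += f_size
            entries.append((mtime, full_name, base_name, f_size))
    # nothing to do while below the watermark
    if total_size < watermark_bytes:
        return 0
    # delete oldest first, skipping files that are still active
    deleted_size = 0
    for _mtime, full_name, base_name, f_size in sorted(entries):
        if base_name in keep:
            continue
        try:
            os.remove(full_name)
        except OSError:
            continue
        deleted_size += f_size
        if total_size - deleted_size < watermark_bytes:
            break
    return deleted_size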
Example #33
def submit_a_worker(data):
    workspec = data['workspec']
    to_submit = data['to_submit']
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # no need to submit bad worker
    if not to_submit:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    # attributes
    try:
        ce_info_dict = data['ce_info_dict']
        batch_log_dict = data['batch_log_dict']
        condor_schedd = data['condor_schedd']
        condor_pool = data['condor_pool']
        use_spool = data['use_spool']
    except KeyError:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    else:
        workspec.reset_changed_list()
    # make batch script
    batchFile = make_batch_script(**data)
    # make condor remote options
    name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
    pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''
    spool_opt = '-spool' if use_spool and condor_schedd else ''
    # command
    comStr = 'condor_submit {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
        sdf_file=batchFile,
        name_opt=name_opt,
        pool_opt=pool_opt,
        spool_opt=spool_opt)
    # submit
    tmpLog.debug('submit with command: {0}'.format(comStr))
    try:
        p = subprocess.Popen(comStr.split(),
                             shell=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(
                r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            workspec.batchID = job_id_match.group(2)
            # set submissionHost
            if not condor_schedd and not condor_pool:
                workspec.submissionHost = None
            else:
                workspec.submissionHost = '{0},{1}'.format(
                    condor_schedd, condor_pool)
            tmpLog.debug('submissionHost={0} batchID={1}'.format(
                workspec.submissionHost, workspec.batchID))
            # set computingElement
            workspec.computingElement = ce_info_dict.get('ce_endpoint', '')
            # set log
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(
                batch_log_dict['batch_stdout'], ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(
                batch_log_dict['batch_stderr'], ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            if not workspec.get_jobspec_list():
                tmpLog.debug(
                    'No jobspec associated in the worker of workerID={0}'.
                    format(workspec.workerID))
            else:
                for jobSpec in workspec.get_jobspec_list():
                    # using batchLog and stdOut URL as pilotID and pilotLog
                    jobSpec.set_one_attribute(
                        'pilotID', workspec.workAttributes['stdOut'])
                    jobSpec.set_one_attribute(
                        'pilotLog', workspec.workAttributes['batchLog'])
            tmpLog.debug('Done set_log_file after submission')
            tmpRetVal = (True, '')

        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (None, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (None, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
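
The batchID above is parsed from the summary line that condor_submit prints on success. A minimal, hedged sketch of just that parsing step; parse_cluster_id() and the sample string are illustrative, not harvester API:

import re

def parse_cluster_id(submit_stdout):
    # look for the standard 'N job(s) submitted to cluster M.' line
    for line in submit_stdout.split('\n'):
        match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                          line)
        if match:
            return match.group(2)
    return None

# e.g. parse_cluster_id('1 job(s) submitted to cluster 1234.') returns '1234'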
Example #34
 def get_files_to_stage_out(self, workspec):
     # get logger
     tmpLog = core_utils.make_logger(_logger,
                                     'workerID={0}'.format(
                                         workspec.workerID),
                                     method_name='get_files_to_stage_out')
     fileDict = dict()
     # look for the json just under the access point
     for pandaID in workspec.pandaid_list:
         # look for the json just under the access point
         accessPoint = self.get_access_point(workspec, pandaID)
         jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName)
         readJsonPath = jsonFilePath + suffixReadJson
         # first look for json.read which is not yet acknowledged
         tmpLog.debug('looking for output file {0}'.format(readJsonPath))
         if not os.path.exists(readJsonPath):
             tmpLog.debug(
                 'looking for output file {0}'.format(jsonFilePath))
             if not os.path.exists(jsonFilePath):
                 # not found
                 tmpLog.debug('not found')
                 continue
             try:
                 tmpLog.debug('found')
                 # rename to prevent from being overwritten
                 os.rename(jsonFilePath, readJsonPath)
             except Exception:
                 tmpLog.error('failed to rename json')
                 continue
         # load json
         toSkip = False
         loadDict = None
         try:
             with open(readJsonPath) as jsonFile:
                 loadDict = json.load(jsonFile)
         except Exception:
             tmpLog.error('failed to load json')
             toSkip = True
         # test validity of data format (ie it should be a Dictionary)
         if not toSkip:
             if not isinstance(loadDict, dict):
                 tmpLog.error('loaded data is not a dictionary')
                 toSkip = True
         # collect files and events
         nData = 0
         if not toSkip:
             sizeMap = dict()
             chksumMap = dict()
             eventsList = dict()
             for tmpPandaID, tmpEventMapList in iteritems(loadDict):
                 tmpPandaID = long(tmpPandaID)
                 # test if tmpEventMapList is a list
                 if not isinstance(tmpEventMapList, list):
                     tmpLog.error('loaded data item is not a list')
                     toSkip = True
                     break
                 for tmpEventInfo in tmpEventMapList:
                     try:
                         nData += 1
                         if 'eventRangeID' in tmpEventInfo:
                             tmpEventRangeID = tmpEventInfo['eventRangeID']
                         else:
                             tmpEventRangeID = None
                         tmpFileDict = dict()
                         pfn = tmpEventInfo['path']
                         lfn = os.path.basename(pfn)
                         tmpFileDict['path'] = pfn
                         if pfn not in sizeMap:
                             if 'fsize' in tmpEventInfo:
                                 sizeMap[pfn] = tmpEventInfo['fsize']
                             else:
                                 sizeMap[pfn] = os.stat(pfn).st_size
                         tmpFileDict['fsize'] = sizeMap[pfn]
                         tmpFileDict['type'] = tmpEventInfo['type']
                         if tmpEventInfo['type'] in ['log', 'output']:
                             # disable zipping
                             tmpFileDict['isZip'] = 0
                         elif tmpEventInfo['type'] == 'zip_output':
                             # already zipped
                             tmpFileDict['isZip'] = 1
                         elif 'isZip' in tmpEventInfo:
                             tmpFileDict['isZip'] = tmpEventInfo['isZip']
                         # guid
                         if 'guid' in tmpEventInfo:
                             tmpFileDict['guid'] = tmpEventInfo['guid']
                         else:
                             tmpFileDict['guid'] = str(uuid.uuid4())
                         # get checksum
                         if pfn not in chksumMap:
                             if 'chksum' in tmpEventInfo:
                                 chksumMap[pfn] = tmpEventInfo['chksum']
                             else:
                                 chksumMap[pfn] = core_utils.calc_adler32(
                                     pfn)
                         tmpFileDict['chksum'] = chksumMap[pfn]
                         if tmpPandaID not in fileDict:
                             fileDict[tmpPandaID] = dict()
                         if lfn not in fileDict[tmpPandaID]:
                             fileDict[tmpPandaID][lfn] = []
                         fileDict[tmpPandaID][lfn].append(tmpFileDict)
                         # skip if unrelated to events
                         if tmpFileDict['type'] not in [
                                 'es_output', 'zip_output'
                         ]:
                             continue
                         tmpFileDict['eventRangeID'] = tmpEventRangeID
                         if tmpPandaID not in eventsList:
                             eventsList[tmpPandaID] = list()
                         eventsList[tmpPandaID].append({
                             'eventRangeID':
                             tmpEventRangeID,
                             'eventStatus':
                             tmpEventInfo['eventStatus']
                         })
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
             # dump events
             if not toSkip:
                 if len(eventsList) > 0:
                     curName = os.path.join(accessPoint,
                                            jsonEventsUpdateFileName)
                     newName = curName + '.new'
                     f = open(newName, 'w')
                     json.dump(eventsList, f)
                     f.close()
                     os.rename(newName, curName)
         # remove empty file
         if toSkip or nData == 0:
             try:
                 os.remove(readJsonPath)
             except Exception:
                 pass
         tmpLog.debug('got {0} files for PandaID={1}'.format(
             nData, pandaID))
     return fileDict
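
The events dump above writes to a temporary '.new' file and then renames it, so a concurrent reader never sees a half-written JSON. A minimal sketch of that write-then-rename pattern, assuming the usual same-filesystem rename semantics; the helper name is illustrative:

import json
import os

def dump_json_atomically(data, target_path):
    # write to a sibling temporary file first
    tmp_path = target_path + '.new'
    with open(tmp_path, 'w') as f:
        json.dump(data, f)
    # os.rename atomically replaces target_path on POSIX filesystems
    os.rename(tmp_path, target_path)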
Example #35
    def ssh_zip_output(self, jobspec, tmp_log):
        tmp_log.debug('start')
        self.zip_tmp_log = tmp_log
        self.zip_jobSpec = jobspec
        argDictList = []
        outFiles_list = list(jobspec.outFiles)
        try:
            try:
                if hasattr(harvester_config, 'zipper'):
                    nThreadsForZip = harvester_config.zipper.nThreadsForZip
                else:
                    nThreadsForZip = harvester_config.stager.nThreadsForZip
            except Exception:
                nThreadsForZip = multiprocessing.cpu_count()
            # check associated file existence
            def _check_assfile_existence(fileSpec):
                in_data = '\\n'.join([
                    '{0}'.format(assFileSpec.path)
                    for assFileSpec in fileSpec.associatedFiles
                ])
                com1 = (
                    'ssh '
                    '-o StrictHostKeyChecking=no '
                    '-i {sshkey} '
                    '{userhost} '
                    '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "'
                ).format(
                    sshkey=self.sshkey,
                    userhost=self.userhost,
                    fileop_script=self.fileop_script,
                    suffix='_check-exist.tmp',
                    dir=os.path.dirname(
                        next(iter(fileSpec.associatedFiles)).path),
                    data=in_data,
                )
                # execute
                p1 = subprocess.Popen(com1,
                                      shell=True,
                                      close_fds=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
                stdOut, stdErr = p1.communicate()
                retCode = p1.returncode
                if retCode != 0:
                    msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(
                        stdOut, stdErr)
                    tmp_log.error(msgStr)
                    return False, 'failed to zip with {0}'.format(msgStr)
                stdOut_str = stdOut if (isinstance(stdOut, str) or
                                        stdOut is None) else stdOut.decode()
                tmpargfile_name = stdOut_str.strip('\n')
                del p1, stdOut, stdErr
                # record set
                existence_set = set()
                # make command
                com2 = (
                    'ssh '
                    '-o StrictHostKeyChecking=no '
                    '-i {sshkey} '
                    '{userhost} '
                    '"cat {arg_file} | xargs -I%% sh -c \' test -f %% && echo T || echo F \' " '
                ).format(
                    sshkey=self.sshkey,
                    userhost=self.userhost,
                    arg_file=tmpargfile_name,
                )
                # execute
                p2 = subprocess.Popen(com2,
                                      shell=True,
                                      close_fds=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
                stdOut, stdErr = p2.communicate()
                retCode = p2.returncode
                if retCode != 0:
                    msgStr = 'failed to check existence of associated files with {0}:{1}'.format(
                        stdOut, stdErr)
                    tmp_log.error(msgStr)
                else:
                    try:
                        stdOut_str = stdOut if (
                            isinstance(stdOut, str)
                            or stdOut is None) else stdOut.decode()
                        ret_list = stdOut_str.strip('\n').split('\n')
                        if len(fileSpec.associatedFiles) == len(ret_list):
                            for (assFileSpec,
                                 retVal) in zip(fileSpec.associatedFiles,
                                                ret_list):
                                if retVal == 'T':
                                    existence_set.add(assFileSpec.path)
                        else:
                            msgStr = 'returned number of files inconsistent! Skipped...'
                            tmp_log.error(msgStr)
                    except Exception:
                        core_utils.dump_error_message(tmp_log)
                del p2, stdOut, stdErr, com2
                # delete tmpargfile
                com3 = ('ssh '
                        '-o StrictHostKeyChecking=no '
                        '-i {sshkey} '
                        '{userhost} '
                        '"{fileop_script} remove_file {file_path} "').format(
                            sshkey=self.sshkey,
                            userhost=self.userhost,
                            fileop_script=self.fileop_script,
                            file_path=tmpargfile_name,
                        )
                # execute
                p3 = subprocess.Popen(com3,
                                      shell=True,
                                      close_fds=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
                stdOut, stdErr = p3.communicate()
                retCode = p3.returncode
                if retCode != 0:
                    msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(
                        stdOut, stdErr)
                    tmp_log.error(msgStr)
                del p3, stdOut, stdErr
                gc.collect()
                return existence_set

            # parallel execution of check existence
            with Pool(max_workers=nThreadsForZip) as pool:
                existence_set_list = pool.map(_check_assfile_existence,
                                              outFiles_list)
            # loop
            for fileSpec, existence_set in zip(outFiles_list,
                                               existence_set_list):
                if self.zipDir == "${SRCDIR}":
                    # the same directory as src
                    zipDir = os.path.dirname(
                        next(iter(fileSpec.associatedFiles)).path)
                elif self.zipDir == "${WORKDIR}":
                    # work dir
                    workSpec = jobspec.get_workspec_list()[0]
                    zipDir = workSpec.get_access_point()
                else:
                    zipDir = self.zipDir
                zipPath = os.path.join(zipDir, fileSpec.lfn)
                argDict = dict()
                argDict['zipPath'] = zipPath
                argDict['associatedFiles'] = []
                # check existence of files
                for assFileSpec in fileSpec.associatedFiles:
                    if assFileSpec.path in existence_set:
                        argDict['associatedFiles'].append(assFileSpec.path)
                    else:
                        assFileSpec.status = 'failed'
                # append
                argDictList.append(argDict)
            # parallel execution of zip
            with Pool(max_workers=nThreadsForZip) as pool:
                retValList = pool.map(self.ssh_make_one_zip, argDictList)
                # check returns
                for fileSpec, retVal in zip(jobspec.outFiles, retValList):
                    tmpRet, errMsg, fileInfo = retVal
                    if tmpRet is True:
                        # set path
                        fileSpec.path = fileInfo['path']
                        fileSpec.fsize = fileInfo['fsize']
                        fileSpec.chksum = fileInfo['chksum']
                        msgStr = 'fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}' \
                            .format(fileSpec.path, fileSpec.fsize, fileSpec.chksum)
                        tmp_log.debug(msgStr)
                    else:
                        tmp_log.error(
                            'got {0} with {1} when zipping {2}'.format(
                                tmpRet, errMsg, fileSpec.lfn))
                        return tmpRet, 'failed to zip with {0}'.format(errMsg)
        except Exception:
            errMsg = core_utils.dump_error_message(tmp_log)
            return False, 'failed to zip with {0}'.format(errMsg)
        tmp_log.debug('done')
        return True, ''
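
Both the existence check and the zipping above fan the per-file work out with a thread pool map, where Pool appears to be concurrent.futures.ThreadPoolExecutor. A minimal sketch of that pattern under that assumption; check_one_file() and the input list are illustrative stand-ins for the remote ssh checks:

import os
from concurrent.futures import ThreadPoolExecutor as Pool

def check_one_file(path):
    # per-item work; a local existence test stands in for the remote ssh test
    return os.path.isfile(path)

paths = ['/tmp/a.root', '/tmp/b.root']  # illustrative input list
with Pool(max_workers=4) as pool:
    # map preserves input order, so results can be zipped back onto paths
    results = list(pool.map(check_one_file, paths))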
Example #36
    def check_workers(self, mon_core, messenger, all_workers, queue_config,
                      tmp_log):
        workersToCheck = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            # job-level late binding
            if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                # check if job is requested
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    # set ready when job is requested
                    workStatus = WorkSpec.ST_ready
                else:
                    workStatus = workSpec.status
            elif workSpec.nJobsToReFill in [0, None]:
                # check if job is requested to refill free slots
                jobRequested = messenger.job_requested(workSpec)
                if jobRequested:
                    nJobsToReFill = jobRequested
                workersToCheck.append(workSpec)
            else:
                workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {
                'newStatus': workStatus,
                'monStatus': workStatus,
                'workAttributes': workAttributes,
                'filesToStageOut': filesToStageOut,
                'eventsRequestParams': eventsRequestParams,
                'eventsToUpdate': eventsToUpdate,
                'diagMessage': '',
                'pandaIDs': pandaIDs,
                'nJobsToReFill': nJobsToReFill
            }
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
            if not tmpStat:
                tmp_log.error(
                    'failed to check workers with: {0}'.format(tmpOut))
            else:
                tmp_log.debug('checked')
                for workSpec, (newStatus,
                               diagMessage) in zip(workersToCheck, tmpOut):
                    workerID = workSpec.workerID
                    tmp_log.debug(
                        'Going to check workerID={0}'.format(workerID))
                    pandaIDs = []
                    if workerID in retMap:
                        # request kill
                        if messenger.kill_requested(workSpec):
                            self.dbProxy.kill_worker(workSpec.workerID)

                        # expired heartbeat - only when requested in the configuration
                        try:
                            # check if the queue configuration requires checking for worker heartbeat
                            worker_heartbeat_limit = int(
                                queue_config.messenger['worker_heartbeat'])
                        except (AttributeError, KeyError):
                            worker_heartbeat_limit = None
                        tmp_log.debug(
                            'workerID={0} heartbeat limit is configured to {1}'
                            .format(workerID, worker_heartbeat_limit))
                        if worker_heartbeat_limit:
                            if messenger.is_alive(workSpec,
                                                  worker_heartbeat_limit):
                                tmp_log.debug(
                                    'heartbeat for workerID={0} is valid'.
                                    format(workerID))
                            else:
                                tmp_log.debug(
                                    'heartbeat for workerID={0} expired: sending kill request'
                                    .format(workerID))
                                self.dbProxy.kill_worker(workSpec.workerID)

                        # get work attributes
                        workAttributes = messenger.get_work_attributes(
                            workSpec)
                        retMap[workerID]['workAttributes'] = workAttributes
                        # get output files
                        filesToStageOut = messenger.get_files_to_stage_out(
                            workSpec)
                        retMap[workerID]['filesToStageOut'] = filesToStageOut
                        # get events to update
                        if workSpec.eventsRequest in [
                                WorkSpec.EV_useEvents,
                                WorkSpec.EV_requestEvents
                        ]:
                            eventsToUpdate = messenger.events_to_update(
                                workSpec)
                            retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                        # request events
                        if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                            eventsRequestParams = messenger.events_requested(
                                workSpec)
                            retMap[workerID][
                                'eventsRequestParams'] = eventsRequestParams
                        # get PandaIDs for pull model
                        if workSpec.mapType == WorkSpec.MT_NoJob:
                            pandaIDs = messenger.get_panda_ids(workSpec)
                        retMap[workerID]['pandaIDs'] = pandaIDs
                        # keep original new status
                        retMap[workerID]['monStatus'] = newStatus
                        # set running while there are events to update or files to stage out
                        if newStatus in [
                                WorkSpec.ST_finished, WorkSpec.ST_failed,
                                WorkSpec.ST_cancelled
                        ]:
                            if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                            len(retMap[workerID]['eventsToUpdate']) > 0:
                                newStatus = WorkSpec.ST_running
                            elif not workSpec.is_post_processed():
                                if not queue_config.is_no_heartbeat_status(
                                        newStatus):
                                    # post processing unless heartbeat is suppressed
                                    jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                                        workSpec.workerID,
                                        None,
                                        True,
                                        only_running=True)
                                    # post processing
                                    messenger.post_processing(
                                        workSpec, jobSpecs, workSpec.mapType)
                                workSpec.post_processed()
                                newStatus = WorkSpec.ST_running
                            # reset modification time to immediately trigger subsequent lookup
                            workSpec.trigger_next_lookup()
                        retMap[workerID]['newStatus'] = newStatus
                        retMap[workerID]['diagMessage'] = diagMessage
                    else:
                        tmp_log.debug(
                            'workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None
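
The heartbeat handling above sends a kill request once a worker's heartbeat is older than the configured limit. A minimal sketch of such an age check based on a heartbeat file's modification time; the file name, unit, and helper are illustrative and not the harvester messenger API:

import os
import time

def heartbeat_is_alive(heartbeat_file, limit_minutes):
    # treat a missing heartbeat file as a dead worker
    try:
        age_seconds = time.time() - os.path.getmtime(heartbeat_file)
    except OSError:
        return False
    return age_seconds < limit_minutes * 60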
Example #37
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.get_pid())
        monitor_fifo = self.monitor_fifo
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            submitted = False
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} {1} commands'.format(
                    len(commandSpecs), comStr))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = self.make_logger(
                                _logger,
                                'id={0} queue={1} rtype={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            try:
                                tmpLog.debug('start')
                                tmpLog.debug('workers status: %s' % tmpVal)
                                nWorkers = tmpVal['nNewWorkers'] + tmpVal[
                                    'nReady']
                                nReady = tmpVal['nReady']

                                # check queue
                                if not self.queueConfigMapper.has_queue(
                                        queueName):
                                    tmpLog.error('config not found')
                                    continue

                                # no new workers
                                if nWorkers == 0:
                                    tmpLog.debug(
                                        'skipped since no new worker is needed based on current stats'
                                    )
                                    continue
                                # get queue
                                queueConfig = self.queueConfigMapper.get_queue(
                                    queueName)
                                workerMakerCore = self.workerMaker.get_plugin(
                                    queueConfig)
                                # check if resource is ready
                                if hasattr(
                                        workerMakerCore, 'dynamicSizing'
                                ) and workerMakerCore.dynamicSizing is True:
                                    numReadyResources = self.workerMaker.num_ready_resources(
                                        queueConfig, resource_type,
                                        workerMakerCore)
                                    tmpLog.debug('numReadyResources: %s' %
                                                 numReadyResources)
                                    if not numReadyResources:
                                        if hasattr(workerMakerCore,
                                                   'staticWorkers'):
                                            nQRWorkers = tmpVal[
                                                'nQueue'] + tmpVal['nRunning']
                                            tmpLog.debug(
                                                'staticWorkers: %s, nQRWorkers(Queue+Running): %s'
                                                %
                                                (workerMakerCore.staticWorkers,
                                                 nQRWorkers))
                                            if nQRWorkers >= workerMakerCore.staticWorkers:
                                                tmpLog.debug(
                                                    'No static workers left, skip'
                                                )
                                                continue
                                            else:
                                                nWorkers = min(
                                                    workerMakerCore.
                                                    staticWorkers - nQRWorkers,
                                                    nWorkers)
                                                tmpLog.debug(
                                                    'staticWorkers: %s, nWorkers: %s'
                                                    %
                                                    (workerMakerCore.
                                                     staticWorkers, nWorkers))
                                        else:
                                            tmpLog.debug(
                                                'skip since no resources are ready'
                                            )
                                            continue
                                    else:
                                        nWorkers = min(nWorkers,
                                                       numReadyResources)
                                # post action of worker maker
                                if hasattr(
                                        workerMakerCore, 'skipOnFail'
                                ) and workerMakerCore.skipOnFail is True:
                                    skipOnFail = True
                                else:
                                    skipOnFail = False
                                # actions based on mapping type
                                if queueConfig.mapType == WorkSpec.MT_NoJob:
                                    # workers without jobs
                                    jobChunks = []
                                    for i in range(nWorkers):
                                        jobChunks.append([])
                                elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                    # one worker per one job
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady, 1, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.
                                        checkInterval, harvester_config.
                                        submitter.lockInterval, lockedBy)
                                elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                    # one worker for multiple jobs
                                    nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nJobsPerWorker={0}'.format(
                                        nJobsPerWorker))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName, nWorkers, nReady,
                                        nJobsPerWorker, None,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.
                                        checkInterval, harvester_config.
                                        submitter.lockInterval, lockedBy,
                                        queueConfig.allowJobMixture)
                                elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                    # multiple workers for one job
                                    nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                        queueConfig,
                                        nWorkers,
                                        resource_type,
                                        maker=workerMakerCore)
                                    maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total(
                                        queueConfig,
                                        resource_type,
                                        maker=workerMakerCore)
                                    maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle(
                                        queueConfig,
                                        resource_type,
                                        maker=workerMakerCore)
                                    tmpLog.debug('nWorkersPerJob={0}'.format(
                                        nWorkersPerJob))
                                    jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                        queueName,
                                        nWorkers,
                                        nReady,
                                        None,
                                        nWorkersPerJob,
                                        queueConfig.useJobLateBinding,
                                        harvester_config.submitter.
                                        checkInterval,
                                        harvester_config.submitter.
                                        lockInterval,
                                        lockedBy,
                                        max_workers_per_job_in_total=
                                        maxWorkersPerJob,
                                        max_workers_per_job_per_cycle=
                                        maxWorkersPerJobPerCycle)
                                else:
                                    tmpLog.error('unknown mapType={0}'.format(
                                        queueConfig.mapType))
                                    continue

                                tmpLog.debug('got {0} job chunks'.format(
                                    len(jobChunks)))
                                if len(jobChunks) == 0:
                                    continue
                                # make workers
                                okChunks, ngChunks = self.workerMaker.make_workers(
                                    jobChunks,
                                    queueConfig,
                                    nReady,
                                    resource_type,
                                    maker=workerMakerCore)
                                if len(ngChunks) == 0:
                                    tmpLog.debug(
                                        'successfully made {0} workers'.format(
                                            len(okChunks)))
                                else:
                                    tmpLog.debug(
                                        'made {0} workers, while {1} workers failed'
                                        .format(len(okChunks), len(ngChunks)))
                                timeNow = datetime.datetime.utcnow()
                                timeNow_timestamp = time.time()
                                pandaIDs = set()
                                # NG (=not good)
                                for ngJobs in ngChunks:
                                    for jobSpec in ngJobs:
                                        if skipOnFail:
                                            # release jobs when workers are not made
                                            pandaIDs.add(jobSpec.PandaID)
                                        else:
                                            jobSpec.status = 'failed'
                                            jobSpec.subStatus = 'failed_to_make'
                                            jobSpec.stateChangeTime = timeNow
                                            jobSpec.lockedBy = None
                                            errStr = 'failed to make a worker'
                                            jobSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            jobSpec.trigger_propagation()
                                            self.dbProxy.update_job(
                                                jobSpec, {
                                                    'lockedBy': lockedBy,
                                                    'subStatus': 'prepared'
                                                })
                                # OK
                                workSpecList = []
                                if len(okChunks) > 0:
                                    for workSpec, okJobs in okChunks:
                                        # has job
                                        if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                                or queueConfig.mapType == WorkSpec.MT_NoJob:
                                            workSpec.hasJob = 0
                                        else:
                                            workSpec.hasJob = 1
                                            if workSpec.nJobsToReFill in [
                                                    None, 0
                                            ]:
                                                workSpec.set_jobspec_list(
                                                    okJobs)
                                            else:
                                                # refill free slots while the worker is running
                                                workSpec.set_jobspec_list(
                                                    okJobs[:workSpec.
                                                           nJobsToReFill])
                                                workSpec.nJobsToReFill = None
                                                for jobSpec in okJobs[
                                                        workSpec.
                                                        nJobsToReFill:]:
                                                    pandaIDs.add(
                                                        jobSpec.PandaID)
                                            workSpec.set_num_jobs_with_list()
                                        # map type
                                        workSpec.mapType = queueConfig.mapType
                                        # queue name
                                        workSpec.computingSite = queueConfig.queueName
                                        # set access point
                                        workSpec.accessPoint = queueConfig.messenger[
                                            'accessPoint']
                                        # sync level
                                        workSpec.syncLevel = queueConfig.get_synchronization_level(
                                        )
                                        # events
                                        if len(okJobs) > 0 and \
                                                ('eventService' in okJobs[0].jobParams or
                                                 'cloneJob' in okJobs[0].jobParams):
                                            workSpec.eventsRequest = WorkSpec.EV_useEvents
                                        workSpecList.append(workSpec)
                                if len(workSpecList) > 0:
                                    sw = core_utils.get_stopwatch()
                                    # get plugin for submitter
                                    submitterCore = self.pluginFactory.get_plugin(
                                        queueConfig.submitter)
                                    if submitterCore is None:
                                        # not found
                                        tmpLog.error(
                                            'submitter plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # get plugin for messenger
                                    messenger = self.pluginFactory.get_plugin(
                                        queueConfig.messenger)
                                    if messenger is None:
                                        # not found
                                        tmpLog.error(
                                            'messenger plugin for {0} not found'
                                            .format(jobSpec.computingSite))
                                        continue
                                    # setup access points
                                    messenger.setup_access_points(workSpecList)
                                    # feed jobs
                                    for workSpec in workSpecList:
                                        if workSpec.hasJob == 1:
                                            tmpStat = messenger.feed_jobs(
                                                workSpec,
                                                workSpec.get_jobspec_list())
                                            if tmpStat is False:
                                                tmpLog.error(
                                                    'failed to send jobs to workerID={0}'
                                                    .format(workSpec.workerID))
                                            else:
                                                tmpLog.debug(
                                                    'sent jobs to workerID={0} with {1}'
                                                    .format(
                                                        workSpec.workerID,
                                                        tmpStat))
                                    # insert workers
                                    self.dbProxy.insert_workers(
                                        workSpecList, lockedBy)
                                    # submit
                                    sw.reset()
                                    tmpLog.info(
                                        'submitting {0} workers'.format(
                                            len(workSpecList)))
                                    workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                        submitterCore, workSpecList)
                                    tmpLog.debug('done submitting {0} workers'.
                                                 format(len(workSpecList)) +
                                                 sw.get_elapsed_time())
                                    # collect successful jobs
                                    okPandaIDs = set()
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        if tmpRet:
                                            workSpec, jobList = okChunks[
                                                iWorker]
                                            jobList = workSpec.get_jobspec_list(
                                            )
                                            if jobList is not None:
                                                for jobSpec in jobList:
                                                    okPandaIDs.add(
                                                        jobSpec.PandaID)
                                    # loop over all workers
                                    for iWorker, (tmpRet, tmpStr) in enumerate(
                                            zip(tmpRetList, tmpStrList)):
                                        workSpec, jobList = okChunks[iWorker]
                                        # set harvesterHost
                                        workSpec.harvesterHost = socket.gethostname(
                                        )
                                        # use associated job list since it can be truncated for re-filling
                                        jobList = workSpec.get_jobspec_list()
                                        # set status
                                        if not tmpRet:
                                            # failed submission
                                            errStr = 'failed to submit a workerID={0} with {1}'.format(
                                                workSpec.workerID, tmpStr)
                                            tmpLog.error(errStr)
                                            workSpec.set_status(
                                                WorkSpec.ST_missed)
                                            workSpec.set_dialog_message(tmpStr)
                                            workSpec.set_pilot_error(
                                                PilotErrors.ERR_SETUPFAILURE,
                                                errStr)
                                            if jobList is not None:
                                                # increment attempt number
                                                newJobList = []
                                                for jobSpec in jobList:
                                                    # skip if successful with another worker
                                                    if jobSpec.PandaID in okPandaIDs:
                                                        continue
                                                    if jobSpec.submissionAttempts is None:
                                                        jobSpec.submissionAttempts = 0
                                                    jobSpec.submissionAttempts += 1
                                                    # max attempt or permanent error
                                                    if tmpRet is False or \
                                                            jobSpec.submissionAttempts >= \
                                                            queueConfig.maxSubmissionAttempts:
                                                        newJobList.append(
                                                            jobSpec)
                                                    else:
                                                        self.dbProxy.increment_submission_attempt(
                                                            jobSpec.PandaID,
                                                            jobSpec.
                                                            submissionAttempts)
                                                jobList = newJobList
                                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                            # directly go to running after feeding jobs for late binding
                                            workSpec.set_status(
                                                WorkSpec.ST_running)
                                        else:
                                            # normal successful submission
                                            workSpec.set_status(
                                                WorkSpec.ST_submitted)
                                        workSpec.submitTime = timeNow
                                        workSpec.modificationTime = timeNow
                                        workSpec.checkTime = timeNow
                                        if self.monitor_fifo.enabled:
                                            workSpec.set_work_params({
                                                'lastCheckAt':
                                                timeNow_timestamp
                                            })
                                        # prefetch events
                                        if tmpRet and workSpec.hasJob == 1 and \
                                                workSpec.eventsRequest == WorkSpec.EV_useEvents and \
                                                queueConfig.prefetchEvents:
                                            workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                            eventsRequestParams = dict()
                                            for jobSpec in jobList:
                                                eventsRequestParams[jobSpec.PandaID] = \
                                                    {'pandaID': jobSpec.PandaID,
                                                     'taskID': jobSpec.taskID,
                                                     'jobsetID': jobSpec.jobParams['jobsetID'],
                                                     'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))),
                                                                    jobSpec.jobParams['coreCount']),
                                                     }
                                            workSpec.eventsRequestParams = eventsRequestParams
                                        # register worker
                                        tmpStat = self.dbProxy.register_worker(
                                            workSpec, jobList, lockedBy)
                                        if jobList is not None:
                                            for jobSpec in jobList:
                                                pandaIDs.add(jobSpec.PandaID)
                                                if tmpStat:
                                                    if tmpRet:
                                                        tmpStr = \
                                                            'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                        tmpLog.info(
                                                            tmpStr.format(workSpec.workerID,
                                                                          jobSpec.PandaID,
                                                                          workSpec.batchID))
                                                    else:
                                                        tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
                                                        tmpLog.error(
                                                            tmpStr.format(workSpec.workerID,
                                                                          jobSpec.PandaID))
                                                else:
                                                    tmpStr = \
                                                        'failed to register a worker for PandaID={0} with batchID={1}'
                                                    tmpLog.error(
                                                        tmpStr.format(jobSpec.PandaID,
                                                                      workSpec.batchID))
                                    # enqueue to monitor fifo
                                    if self.monitor_fifo.enabled \
                                            and queueConfig.mapType != WorkSpec.MT_MultiWorkers:
                                        workSpecsToEnqueue = \
                                            [[w] for w in workSpecList if w.status
                                             in (WorkSpec.ST_submitted, WorkSpec.ST_running)]
                                        self.monitor_fifo.put(
                                            (queueName, workSpecsToEnqueue),
                                            time.time() + harvester_config.monitor.fifoCheckInterval)
                                        mainLog.debug(
                                            'put workers to monitor FIFO')
                                    submitted = True
                                # release jobs
                                self.dbProxy.release_jobs(pandaIDs, lockedBy)
                                tmpLog.info('done')
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
                if submitted and hasattr(harvester_config.submitter,
                                         'minSubmissionInterval'):
                    interval = harvester_config.submitter.minSubmissionInterval
                    if interval > 0:
                        newTime = datetime.datetime.utcnow(
                        ) + datetime.timedelta(seconds=interval)
                        self.dbProxy.update_panda_queue_attribute(
                            'submitTime', newTime, site_name=siteName)

            # time the cycle
            mainLog.debug('done a submitter cycle' +
                          sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return
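
A minimal sketch of how the nRanges value in the event-prefetch block above is derived; the numbers are hypothetical and only illustrate the max(ceil(nCore / nJobs), coreCount) rule.

import math

# hypothetical inputs: a 24-core worker sharing 2 jobs, each job declaring 8 cores
n_core_per_worker = 24
n_jobs = 2
core_count_per_job = 8

# same rule as in the prefetch block: request at least one event range per core,
# but never fewer than the job's own core count
n_ranges = max(int(math.ceil(n_core_per_worker / n_jobs)), core_count_per_job)
print(n_ranges)  # 12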
Example #38
 def make_one_zip(self, arg_dict):
     try:
         zipPath = arg_dict['zipPath']
         lfn = os.path.basename(zipPath)
         self.zip_tmp_log.debug(
             '{0} start zipPath={1} with {2} files'.format(
                 lfn, zipPath, len(arg_dict['associatedFiles'])))
         # make zip if doesn't exist
         if not os.path.exists(zipPath):
             # tmp file names
             tmpZipPath = zipPath + '.' + str(uuid.uuid4())
             tmpZipPathIn = tmpZipPath + '.in'
             with open(tmpZipPathIn, "w") as f:
                 for associatedFile in arg_dict['associatedFiles']:
                     f.write("{0}\n".format(associatedFile))
             # make command
             com = 'tar -c -f {0} -T {1} '.format(tmpZipPath, tmpZipPathIn)
             com += "--transform 's/.*\///' "
             # execute
             p = subprocess.Popen(com,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
             stdOut, stdErr = p.communicate()
             retCode = p.returncode
             if retCode != 0:
                 msgStr = 'failed to make zip for {0} with {1}:{2}'.format(
                     lfn, stdOut, stdErr)
                 self.zip_tmp_log.error(msgStr)
                 return None, msgStr, {}
             # avoid overwriting
             lockName = 'zip.lock.{0}'.format(lfn)
             lockInterval = 60
             tmpStat = False
             # get lock
             for i in range(lockInterval):
                 tmpStat = self.dbInterface.get_object_lock(
                     lockName, lock_interval=lockInterval)
                 if tmpStat:
                     break
                 time.sleep(1)
             # failed to lock
             if not tmpStat:
                 msgStr = 'failed to lock for {0}'.format(lfn)
                 self.zip_tmp_log.error(msgStr)
                 return None, msgStr, {}
             if not os.path.exists(zipPath):
                 os.rename(tmpZipPath, zipPath)
             # release lock
             self.dbInterface.release_object_lock(lockName)
         # make return
         fileInfo = dict()
         fileInfo['path'] = zipPath
         # get size
         statInfo = os.stat(zipPath)
         fileInfo['fsize'] = statInfo.st_size
         fileInfo['chksum'] = core_utils.calc_adler32(zipPath)
     except Exception:
         errMsg = core_utils.dump_error_message(self.zip_tmp_log)
         return False, 'failed to zip with {0}'.format(errMsg)
     self.zip_tmp_log.debug('{0} done'.format(lfn))
     return True, '', fileInfo
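
The tar invocation above reads member paths from a list file and flattens directory prefixes. A standalone sketch of the same pattern, using hypothetical file paths and assuming GNU tar is available on the host:

import subprocess
import tempfile

# hypothetical inputs
associated_files = ['/data/harvester/job1/log.tgz', '/data/harvester/job1/payload.out']
zip_path = '/tmp/example.zip.tmp'

# write the member list to a temporary file, one path per line
with tempfile.NamedTemporaryFile('w', suffix='.in', delete=False) as f:
    f.write('\n'.join(associated_files) + '\n')
    list_path = f.name

# tar -T reads member paths from the list file; --transform strips leading directories
com = "tar -c -f {0} -T {1} --transform 's/.*\\///'".format(zip_path, list_path)
p = subprocess.Popen(com, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
std_out, std_err = p.communicate()
print(p.returncode, std_out, std_err)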
Example #39
 def run(self):
     lockedBy = 'stager-{0}'.format(self.get_pid())
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         mainLog.debug('try to get jobs to check')
         # get jobs to check preparation
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck
         except Exception:
             maxFilesPerJob = None
         jobsToCheck = self.dbProxy.get_jobs_for_stage_out(
             harvester_config.stager.maxJobsToCheck,
             harvester_config.stager.checkInterval,
             harvester_config.stager.lockInterval,
             lockedBy,
             'transferring',
             JobSpec.HO_hasTransfer,
             max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck)))
         # loop over all jobs
         for jobSpec in jobsToCheck:
             tmpLog = self.make_logger(_logger,
                                       'PandaID={0}'.format(
                                           jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('start checking')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(
                         jobSpec.computingSite, configID):
                     tmpLog.error(
                         'queue config for {0}/{1} not found'.format(
                             jobSpec.computingSite, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(
                     queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(
                         jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(
                     jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 tmpStat, tmpStr = stagerCore.check_status(jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     newSubStatus = self.dbProxy.update_job_for_stage_out(
                         jobSpec, True, lockedBy)
                     tmpLog.debug(
                         'succeeded new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug(
                         'fatal error when checking status with {0}'.format(
                             tmpStr))
                     # update job
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED,
                                             errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(
                         jobSpec, True, lockedBy)
                     tmpLog.debug(
                         'updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # on-going
                     tmpLog.debug(
                         'try to check later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to trigger stage-out
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger
         except Exception:
             maxFilesPerJob = None
         jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(
             harvester_config.stager.maxJobsToTrigger,
             harvester_config.stager.triggerInterval,
             harvester_config.stager.lockInterval,
             lockedBy,
             'to_transfer',
             JobSpec.HO_hasOutput,
             JobSpec.HO_hasZipOutput,
             max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger)))
         # loop over all jobs
         for jobSpec in jobsToTrigger:
             tmpLog = self.make_logger(_logger,
                                       'PandaID={0}'.format(
                                           jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to trigger stage-out')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(
                         jobSpec.computingSite, configID):
                     tmpLog.error(
                         'queue config for {0}/{1} not found'.format(
                             jobSpec.computingSite, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(
                     queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(
                         jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(
                     jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # trigger stage-out
                 tmpStat, tmpStr = stagerCore.trigger_stage_out(jobSpec)
                 # check result
                 if tmpStat is True:
                     # succeeded
                     jobSpec.all_files_triggered_to_stage_out()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(
                         jobSpec, True, lockedBy)
                     tmpLog.debug(
                         'triggered new subStatus={0}'.format(newSubStatus))
                 elif tmpStat is False:
                     # fatal error
                     tmpLog.debug(
                         'fatal error to trigger with {0}'.format(tmpStr))
                     # update job
                     for fileSpec in jobSpec.outFiles:
                         if fileSpec.status != 'finished':
                             fileSpec.status = 'failed'
                     errStr = 'stage-out failed with {0}'.format(tmpStr)
                     jobSpec.set_pilot_error(PilotErrors.ERR_STAGEOUTFAILED,
                                             errStr)
                     jobSpec.trigger_propagation()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(
                         jobSpec, True, lockedBy)
                     tmpLog.debug(
                         'updated new subStatus={0}'.format(newSubStatus))
                 else:
                     # temporary error
                     tmpLog.debug(
                         'try to trigger later since {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         # get jobs to zip output
         try:
             maxFilesPerJob = harvester_config.stager.maxFilesPerJobToZip
         except Exception:
             maxFilesPerJob = None
         try:
             zipInterval = harvester_config.stager.zipInterval
         except Exception:
             zipInterval = harvester_config.stager.triggerInterval
         jobsToZip = self.dbProxy.get_jobs_for_stage_out(
             harvester_config.stager.maxJobsToZip,
             zipInterval,
             harvester_config.stager.lockInterval,
             lockedBy,
             'to_transfer',
             JobSpec.HO_hasZipOutput,
             JobSpec.HO_hasOutput,
             max_files_per_job=maxFilesPerJob)
         mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip)))
         # loop over all jobs
         for jobSpec in jobsToZip:
             tmpLog = self.make_logger(_logger,
                                       'PandaID={0}'.format(
                                           jobSpec.PandaID),
                                       method_name='run')
             try:
                 tmpLog.debug('try to zip output')
                 # configID
                 configID = jobSpec.configID
                 if not core_utils.dynamic_plugin_change():
                     configID = None
                 # get queue
                 if not self.queueConfigMapper.has_queue(
                         jobSpec.computingSite, configID):
                     tmpLog.error(
                         'queue config for {0}/{1} not found'.format(
                             jobSpec.computingSite, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     jobSpec.computingSite, configID)
                 # get plugin
                 stagerCore = self.pluginFactory.get_plugin(
                     queueConfig.stager)
                 if stagerCore is None:
                     # not found
                     tmpLog.error('plugin for {0} not found'.format(
                         jobSpec.computingSite))
                     continue
                 # lock job again
                 lockedAgain = self.dbProxy.lock_job_again(
                     jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy)
                 if not lockedAgain:
                     tmpLog.debug('skip since locked by another thread')
                     continue
                 # trigger preparation
                 tmpStat, tmpStr = stagerCore.zip_output(jobSpec)
                 # succeeded
                 if tmpStat is True:
                     # update job
                     jobSpec.all_files_zipped()
                     newSubStatus = self.dbProxy.update_job_for_stage_out(
                         jobSpec, False, lockedBy)
                     tmpLog.debug(
                         'zipped new subStatus={0}'.format(newSubStatus))
                 else:
                     # failed
                     tmpLog.debug('failed to zip with {0}'.format(tmpStr))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.stager.sleepTime):
             mainLog.debug('terminated')
             return
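
The loop above interprets each plugin return value as a tri-state: True means the operation finished, False means a fatal error, and anything else (typically None) means it should be retried later. A hypothetical no-op plugin sketch illustrating that convention (the class name is a placeholder, not part of harvester):

class DummyStager(object):
    """Minimal stand-in showing the (status, diagnostics) contract used above."""

    def check_status(self, jobspec):
        # True -> done, False -> fatal, None -> check again later
        return True, ''

    def trigger_stage_out(self, jobspec):
        # pretend the transfer was queued successfully
        return True, ''

    def zip_output(self, jobspec):
        # nothing to zip in this dummy; non-True would be treated as failure
        return True, ''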
Example #40
def submit_a_worker(data):
    workspec = data['workspec']
    template = data['template']
    log_dir = data['log_dir']
    n_core_per_node = data['n_core_per_node']
    panda_queue_name = data['panda_queue_name']
    x509_user_proxy = data['x509_user_proxy']
    ce_info_dict = data['ce_info_dict']
    batch_log_dict = data['batch_log_dict']
    special_par = data['special_par']
    workspec.reset_changed_list()
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # make batch script
    # batchFile = make_batch_script(workspec=workspec, template=template, n_core_per_node=n_core_per_node, log_dir=log_dir,
    #                                 panda_queue_name=panda_queue_name, x509_user_proxy=x509_user_proxy,
    #                                 ce_info_dict=ce_info_dict, batch_log_dict=batch_log_dict, special_par=special_par)
    batchFile = make_batch_script(**data)
    # command
    comStr = 'condor_submit {0}'.format(batchFile)
    # submit
    tmpLog.debug('submit with {0}'.format(batchFile))
    try:
        p = subprocess.Popen(comStr.split(),
                             shell=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(
                r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            workspec.batchID = job_id_match.group(2)
            tmpLog.debug('batchID={0}'.format(workspec.batchID))
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(
                batch_log_dict['batch_stdout'], ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(
                batch_log_dict['batch_stderr'], ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            tmpRetVal = (True, '')
        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (False, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (False, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
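
A small sketch of the batchID extraction above, run against a typical condor_submit summary line; the sample output string is illustrative only.

import re

# typical last line printed by condor_submit (illustrative)
sample_stdout = 'Submitting job(s).\n1 job(s) submitted to cluster 1234.\n'

job_id_match = None
for tmp_line_str in sample_stdout.split('\n'):
    job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str)
    if job_id_match:
        break

# group(2) is the ClusterId, which becomes workspec.batchID
print(job_id_match.group(2))  # 1234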
Example #41
 def trigger_preparation(self, jobspec):
     # make logger
     tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                               method_name='trigger_preparation')
     tmpLog.debug('start')
     # loop over all inputs
     allDone = True
     for tmpFileSpec in jobspec.inFiles:
         # local access path
         url = tmpFileSpec.url
         accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn)
         # check if already exits
         if os.path.exists(accPath):
             continue
         # make directories if needed
         if not os.path.isdir(os.path.dirname(accPath)):
             os.makedirs(os.path.dirname(accPath))
         # get
         return_code = 1
         if url.startswith('http'):
             try:
                 tmpLog.debug('getting via http from {0} to {1}'.format(url, accPath))
                 res = requests.get(url, timeout=180, verify=False)
                 if res.status_code == 200:
                     with open(accPath, 'wb') as f:
                         f.write(res.content)
                     return_code = 0
                 else:
                     errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text)
                     tmpLog.error(errMsg)
             except requests.exceptions.ReadTimeout:
                 tmpLog.error('read timeout when getting data from {0}'.format(url))
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         elif url.startswith('docker'):
             args = ['docker', 'save', '-o', accPath, url.split('://')[-1]]
             try:
                 tmpLog.debug('executing ' + ' '.join(args))
                 p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                      universal_newlines=True)
                 stdout, stderr = p.communicate()
                 return_code = p.returncode
                 if stdout is not None:
                     stdout = stdout.replace('\n', ' ')
                 if stderr is not None:
                     stderr = stderr.replace('\n', ' ')
                 tmpLog.debug("stdout: %s" % stdout)
                 tmpLog.debug("stderr: %s" % stderr)
             except Exception:
                 core_utils.dump_error_message(tmpLog)
         else:
             tmpLog.error('unsupported protocol in {0}'.format(url))
         if return_code != 0:
             allDone = False
     if allDone:
         tmpLog.debug('succeeded')
         return True, ''
     else:
         errMsg = 'failed'
         tmpLog.error(errMsg)
         # check attemptNr
         for tmpFileSpec in jobspec.inFiles:
             if tmpFileSpec.attemptNr >= self.maxAttempts:
                 errMsg = 'gave up due to max attempts'
                 tmpLog.error(errMsg)
                 return (False, errMsg)
         return None, errMsg
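
For the docker branch above, the image reference is whatever follows the scheme, and docker save writes it to the local access path. A tiny sketch of that argument construction with a hypothetical URL:

# hypothetical input URL and target path
url = 'docker://busybox:latest'
acc_path = '/tmp/busybox.tar'

# strip the scheme; everything after '://' is the image reference
image = url.split('://')[-1]
args = ['docker', 'save', '-o', acc_path, image]
print(args)  # ['docker', 'save', '-o', '/tmp/busybox.tar', 'busybox:latest']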
Example #42
 def trigger_stage_out(self, jobspec):
     # make logger
     tmpLog = self.make_logger(baseLogger,
                               'PandaID={0}'.format(jobspec.PandaID),
                               method_name='trigger_stage_out')
     tmpLog.debug('start')
     # loop over all files
     files = dict()
     transferIDs = dict()
     transferDatasets = dict()
     fileAttrs = jobspec.get_output_file_attributes()
     for fileSpec in jobspec.outFiles:
         # skip zipped files
         if fileSpec.zipFileID is not None:
             continue
         # skip if already processed
         if 'transferDataset' in fileSpec.fileAttributes:
             if fileSpec.fileType not in transferDatasets:
                 transferDatasets[
                     fileSpec.
                     fileType] = fileSpec.fileAttributes['transferDataset']
             if fileSpec.fileType not in transferIDs:
                 transferIDs[fileSpec.fileType] = fileSpec.fileAttributes[
                     'transferID']
             continue
         # set OS ID
         if fileSpec.fileType in ['es_output', 'zip_output']:
             fileSpec.objstoreID = self.objStoreID_ES
         # make path where file is copied for transfer
         if fileSpec.fileType != 'zip_output':
             scope = fileAttrs[fileSpec.lfn]['scope']
             datasetName = fileAttrs[fileSpec.lfn]['dataset']
         else:
             # use panda scope for zipped files
             scope = self.scopeForTmp
             datasetName = 'dummy'
         srcPath = fileSpec.path
         dstPath = mover_utils.construct_file_path(self.srcBasePath, scope,
                                                   fileSpec.lfn)
         # remove
         if os.path.exists(dstPath):
             os.remove(dstPath)
         # copy
         tmpLog.debug('copy src={srcPath} dst={dstPath}'.format(
             srcPath=srcPath, dstPath=dstPath))
         dstDir = os.path.dirname(dstPath)
         if not os.path.exists(dstDir):
             os.makedirs(dstDir)
         shutil.copyfile(srcPath, dstPath)
         # collect files
         tmpFile = dict()
         tmpFile['scope'] = scope
         tmpFile['name'] = fileSpec.lfn
         tmpFile['bytes'] = fileSpec.fsize
         if fileSpec.fileType not in files:
             files[fileSpec.fileType] = []
         files[fileSpec.fileType].append(tmpFile)
     # loop over all file types to be registered to rucio
     rucioAPI = RucioClient()
     for fileType, fileList in iteritems(files):
         # set destination RSE
         if fileType in ['es_output', 'zip_output']:
             dstRSE = self.dstRSE_ES
         elif fileType == 'output':
             dstRSE = self.dstRSE_Out
         elif fileType == 'log':
             dstRSE = self.dstRSE_Log
         else:
             errMsg = 'unsupported file type {0}'.format(fileType)
             tmpLog.error(errMsg)
             return (False, errMsg)
         # skip if destination is None
         if dstRSE is None:
             continue
         # make datasets if missing
         if fileType not in transferDatasets:
             try:
                 tmpScope = self.scopeForTmp
                 tmpDS = 'panda.harvester_stage_out.{0}'.format(
                     str(uuid.uuid4()))
                 rucioAPI.add_dataset(tmpScope,
                                      tmpDS,
                                      meta={'hidden': True},
                                      lifetime=30 * 24 * 60 * 60,
                                      files=fileList,
                                      rse=self.srcRSE)
                 transferDatasets[fileType] = tmpDS
                 # add rule
                 tmpDID = dict()
                 tmpDID['scope'] = tmpScope
                 tmpDID['name'] = tmpDS
                 tmpRet = rucioAPI.add_replication_rule([tmpDID],
                                                        1,
                                                        dstRSE,
                                                        lifetime=30 * 24 *
                                                        60 * 60)
                 tmpTransferIDs = tmpRet[0]
                 transferIDs[fileType] = tmpTransferIDs
                 tmpLog.debug('register dataset {0} with rule {1}'.format(
                     tmpDS, str(tmpTransferIDs)))
             except Exception:
                 errMsg = core_utils.dump_error_message(tmpLog)
                 return (False, errMsg)
         else:
             # add files to existing dataset
             try:
                 tmpScope = self.scopeForTmp
                 tmpDS = transferDatasets[fileType]
                 rucioAPI.add_files_to_dataset(tmpScope, tmpDS, fileList,
                                               self.srcRSE)
                 tmpLog.debug('added files to {0}'.format(tmpDS))
             except Exception:
                 errMsg = core_utils.dump_error_message(tmpLog)
                 return (False, errMsg)
     # set transfer datasets and rules
     for fileSpec in jobspec.outFiles:
         # skip zipped files
         if fileSpec.zipFileID is not None:
             continue
         # skip already done
         if fileSpec.status in ['finished', 'failed']:
             continue
         # skip if already processed
         if 'transferDataset' in fileSpec.fileAttributes:
             continue
         # no destination
         if fileSpec.fileType not in transferDatasets:
             fileSpec.status = 'finished'
             continue
         # set dataset
         fileSpec.fileAttributes['transferDataset'] = transferDatasets[
             fileSpec.fileType]
         # set rule
         fileSpec.fileAttributes['transferID'] = transferIDs[
             fileSpec.fileType]
         # force update
         fileSpec.force_update('fileAttributes')
     # return
     tmpLog.debug('done')
     return (True, '')
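
The if/elif chain above maps each file type to a destination RSE configured on the plugin. An equivalent dict-based lookup, shown only as an illustration of that mapping (the RSE names are hypothetical; attribute names follow the code above):

# hypothetical plugin attributes, matching the names used above
dstRSE_ES = 'EXAMPLE_OBJECTSTORE'
dstRSE_Out = 'EXAMPLE_DATADISK'
dstRSE_Log = 'EXAMPLE_DATADISK'

rse_by_type = {
    'es_output': dstRSE_ES,
    'zip_output': dstRSE_ES,
    'output': dstRSE_Out,
    'log': dstRSE_Log,
}

for file_type in ['output', 'log', 'zip_output', 'es_output']:
    if file_type not in rse_by_type:
        raise ValueError('unsupported file type {0}'.format(file_type))
    dst_rse = rse_by_type[file_type]
    if dst_rse is None:
        continue  # destination intentionally disabled, like the 'skip if destination is None' branch
    print(file_type, '->', dst_rse)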
Example #43
 def run(self):
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         sw_main = core_utils.get_stopwatch()
         mainLog = self.make_logger(_logger,
                                    'id={0}'.format(lockedBy),
                                    method_name='run')
         # get commands to kill
         sw_getcomm = core_utils.get_stopwatch()
         mainLog.debug('try to get commands')
         comStr = CommandSpec.COM_killWorkers
         commandSpecs = self.dbProxy.get_commands_for_receiver(
             'sweeper', comStr)
         mainLog.debug('got {0} {1} commands'.format(
             len(commandSpecs), comStr))
         for commandSpec in commandSpecs:
             n_to_kill = self.dbProxy.kill_workers_by_query(
                 commandSpec.params)
             mainLog.debug('will kill {0} workers with {1}'.format(
                 n_to_kill, commandSpec.params))
         mainLog.debug('done handling commands' +
                       sw_getcomm.get_elapsed_time())
         # killing stage
         sw_kill = core_utils.get_stopwatch()
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(
             harvester_config.sweeper.maxWorkers,
             harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(
             len(workersToKill)))
         # loop over all workers
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(workersToKill):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 try:
                     sweeperCore = self.pluginFactory.get_plugin(
                         queueConfig.sweeper)
                 except Exception:
                     mainLog.error(
                         'failed to launch sweeper plugin for {0}/{1}'.
                         format(queueName, configID))
                     core_utils.dump_error_message(mainLog)
                     continue
                 sw.reset()
                 n_workers = len(workspec_list)
                 try:
                     # try bulk method
                     tmpLog = self.make_logger(_logger,
                                               'id={0}'.format(lockedBy),
                                               method_name='run')
                     tmpLog.debug('start killing')
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpLog.debug('start killing one worker')
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                             tmpLog.debug(
                                 'done killing with status={0} diag={1}'.
                                 format(tmpStat, tmpOut))
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 else:
                     # bulk method
                     n_killed = 0
                     for workspec, (tmpStat,
                                    tmpOut) in zip(workspec_list, tmpList):
                         tmpLog.debug(
                             'done killing workerID={0} with status={1} diag={2}'
                             .format(workspec.workerID, tmpStat, tmpOut))
                         if tmpStat:
                             n_killed += 1
                     tmpLog.debug('killed {0}/{1} workers'.format(
                         n_killed, n_workers))
                 mainLog.debug(
                     'done killing {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
         # cleanup stage
         sw_cleanup = core_utils.get_stopwatch()
         # timeout for missed
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         try:
             keepPending = harvester_config.sweeper.keepPending
         except Exception:
             keepPending = 24
         # get workers for cleanup
         statusTimeoutMap = {
             'finished': harvester_config.sweeper.keepFinished,
             'failed': harvester_config.sweeper.keepFailed,
             'cancelled': harvester_config.sweeper.keepCancelled,
             'missed': keepMissed,
             'pending': keepPending
         }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(
             harvester_config.sweeper.maxWorkers, statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(
             len(workersForCleanup)))
         sw = core_utils.get_stopwatch()
         for queueName, configIdWorkSpecList in iteritems(
                 workersForCleanup):
             for configID, workspec_list in iteritems(configIdWorkSpecList):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(
                         queueName, configID):
                     mainLog.error(
                         'queue config for {0}/{1} not found'.format(
                             queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(
                     queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(
                     queueConfig.sweeper)
                 messenger = self.pluginFactory.get_plugin(
                     queueConfig.messenger)
                 sw.reset()
                 n_workers = len(workspec_list)
                 # make sure workers to clean up are all terminated
                 mainLog.debug(
                     'making sure workers to clean up are all terminated')
                 try:
                     # try bulk method
                     tmpList = sweeperCore.kill_workers(workspec_list)
                 except AttributeError:
                     # fall back to single-worker method
                     for workspec in workspec_list:
                         tmpLog = self.make_logger(_logger,
                                                   'workerID={0}'.format(
                                                       workspec.workerID),
                                                   method_name='run')
                         try:
                             tmpStat, tmpOut = sweeperCore.kill_worker(
                                 workspec)
                         except Exception:
                             core_utils.dump_error_message(tmpLog)
                 except Exception:
                     core_utils.dump_error_message(mainLog)
                 mainLog.debug(
                     'made sure workers to clean up are all terminated')
                 # start cleanup
                 for workspec in workspec_list:
                     tmpLog = self.make_logger(_logger,
                                               'workerID={0}'.format(
                                                   workspec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleaning up one worker')
                         # sweep worker
                         tmpStat, tmpOut = sweeperCore.sweep_worker(
                             workspec)
                         tmpLog.debug(
                             'swept_worker with status={0} diag={1}'.format(
                                 tmpStat, tmpOut))
                         tmpLog.debug('start messenger cleanup')
                         mc_tmpStat, mc_tmpOut = messenger.clean_up(
                             workspec)
                         tmpLog.debug(
                             'messenger cleaned up with status={0} diag={1}'
                             .format(mc_tmpStat, mc_tmpOut))
                         if tmpStat:
                             self.dbProxy.delete_worker(workspec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 mainLog.debug(
                     'done cleaning up {0} workers'.format(n_workers) +
                     sw.get_elapsed_time())
         mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
         # old-job-deletion stage
         sw_delete = core_utils.get_stopwatch()
         mainLog.debug('delete old jobs')
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         # delete orphaned job info
         self.dbProxy.delete_orphaned_job_info()
         mainLog.debug('done deletion of old jobs' +
                       sw_delete.get_elapsed_time())
         # time the cycle
         mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
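
The killing and cleanup stages above both use the same fallback: try a bulk kill_workers() call and drop back to per-worker kill_worker() when the plugin does not implement the bulk method (AttributeError). A compact sketch of that pattern with a hypothetical plugin object:

def kill_all(sweeper_core, workspec_list):
    """Try bulk kill first, fall back to killing one worker at a time."""
    try:
        # bulk method; plugins without it raise AttributeError
        return sweeper_core.kill_workers(workspec_list)
    except AttributeError:
        results = []
        for workspec in workspec_list:
            try:
                results.append(sweeper_core.kill_worker(workspec))
            except Exception as exc:
                results.append((None, str(exc)))
        return results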
Example #44
 def define_num_workers(self, static_num_workers, site_name):
     tmpLog = core_utils.make_logger(_logger,
                                     'site={0}'.format(site_name),
                                     method_name='define_num_workers')
     tmpLog.debug('start')
     dyn_num_workers = copy.copy(static_num_workers)
     try:
         # get queue status
         queueStat = self.dbProxy.get_cache("panda_queues.json", None)
         if queueStat is None:
             queueStat = dict()
         else:
             queueStat = queueStat.data
         # define num of new workers
         for queueName, tmpVal in iteritems(static_num_workers):
             # set 0 to num of new workers when the queue is disabled
             if queueName in queueStat and queueStat[queueName][
                     'status'] in ['offline']:
                 dyn_num_workers[queueName]['nNewWorkers'] = 0
                 retMsg = 'set nNewWorkers=0 since status={0}'.format(
                     queueStat[queueName]['status'])
                 tmpLog.debug(retMsg)
                 continue
             # get queue
             queueConfig = self.queueConfigMapper.get_queue(queueName)
             # get throttler
             if queueName not in self.throttlerMap:
                 if hasattr(queueConfig, 'throttler'):
                     throttler = self.pluginFactory.get_plugin(
                         queueConfig.throttler)
                 else:
                     throttler = None
                 self.throttlerMap[queueName] = throttler
             # check throttler
             throttler = self.throttlerMap[queueName]
             if throttler is not None:
                 toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                 if toThrottle:
                     dyn_num_workers[queueName]['nNewWorkers'] = 0
                     retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(
                         throttler.__class__.__name__, tmpMsg)
                     tmpLog.debug(retMsg)
                     continue
             # check stats
             nQueue = tmpVal['nQueue']
             nReady = tmpVal['nReady']
             nRunning = tmpVal['nRunning']
             nQueueLimit = queueConfig.nQueueLimitWorker
             maxWorkers = queueConfig.maxWorkers
             if queueConfig.runMode == 'slave':
                 nNewWorkersDef = tmpVal['nNewWorkers']
                 if nNewWorkersDef == 0:
                     dyn_num_workers[queueName]['nNewWorkers'] = 0
                     retMsg = 'set nNewWorkers=0 by panda in slave mode'
                     tmpLog.debug(retMsg)
                     continue
             else:
                 nNewWorkersDef = None
             # define num of new workers based on static site config
             nNewWorkers = 0
             if nQueueLimit > 0 and nQueue >= nQueueLimit:
                 # enough queued workers
                 retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(
                     nQueue, nQueueLimit)
                 tmpLog.debug(retMsg)
                 pass
             elif maxWorkers > 0 and (nQueue + nReady +
                                      nRunning) >= maxWorkers:
                 # enough workers in the system
                 retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                     nQueue, nReady, nRunning)
                 retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                 tmpLog.debug(retMsg)
                 pass
             else:
                 # get max number of queued workers
                 maxQueuedWorkers = 0
                 if nQueueLimit > 0:
                     maxQueuedWorkers = nQueueLimit
                 if maxQueuedWorkers == 0:
                     if nNewWorkersDef is not None:
                         # slave mode
                         maxQueuedWorkers = nNewWorkersDef + nQueue
                     else:
                         # use default value
                         maxQueuedWorkers = 1
                 # new workers
                 nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                 if maxWorkers > 0:
                     nNewWorkers = min(
                         nNewWorkers,
                         max(maxWorkers - nQueue - nReady - nRunning, 0))
             if queueConfig.maxNewWorkersPerCycle > 0:
                 nNewWorkers = min(nNewWorkers,
                                   queueConfig.maxNewWorkersPerCycle)
             dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers
         # dump
         tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
         return dyn_num_workers
     except Exception:
         # dump error
         errMsg = core_utils.dump_error_message(tmpLog)
         return None
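
A worked example of the sizing logic above, with hypothetical queue statistics; it reproduces the nNewWorkers arithmetic for the normal (non-throttled, non-slave) branch.

# hypothetical stats and limits
nQueue, nReady, nRunning = 3, 1, 20
nQueueLimitWorker = 10
maxWorkers = 30
maxNewWorkersPerCycle = 5

# neither cap is hit: nQueue < nQueueLimitWorker and the total is below maxWorkers
maxQueuedWorkers = nQueueLimitWorker                  # 10
nNewWorkers = max(maxQueuedWorkers - nQueue, 0)       # 7
if maxWorkers > 0:
    nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0))  # min(7, 6) = 6
if maxNewWorkersPerCycle > 0:
    nNewWorkers = min(nNewWorkers, maxNewWorkersPerCycle)  # 5
print(nNewWorkers)  # 5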
Example #45
 def ssh_make_one_zip(self, arg_dict):
     try:
         zipPath = arg_dict['zipPath']
         lfn = os.path.basename(zipPath)
         self.zip_tmp_log.debug(
             '{0} start zipPath={1} with {2} files'.format(
                 lfn, zipPath, len(arg_dict['associatedFiles'])))
         in_data = '\\n'.join(
             ['{0}'.format(path) for path in arg_dict['associatedFiles']])
         com0 = (
             'ssh '
             '-o StrictHostKeyChecking=no '
             '-i {sshkey} '
             '{userhost} '
             '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "'
         ).format(
             sshkey=self.sshkey,
             userhost=self.userhost,
             fileop_script=self.fileop_script,
             suffix='_tar-name.tmp',
             dir=os.path.dirname(zipPath),
             data=in_data,
         )
         # execute
         p0 = subprocess.Popen(com0,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         stdOut, stdErr = p0.communicate()
         retCode = p0.returncode
         if retCode != 0:
             msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(
                 stdOut, stdErr)
             self.zip_tmp_log.error(msgStr)
             return False, 'failed to zip with {0}'.format(msgStr)
         stdOut_str = stdOut if (isinstance(stdOut, str)
                                 or stdOut is None) else stdOut.decode()
         tmpargfile_name = stdOut_str.strip('\n')
         del p0, stdOut, stdErr
         # tmp zip file names
         tmpZipPath = zipPath + '.' + str(uuid.uuid4())
         com1 = (
             'ssh '
             '-o StrictHostKeyChecking=no '
             '-i {sshkey} '
             '{userhost} '
             '"test -f {tmpZipPath} || tar -cf {tmpZipPath} -T {arg_file} --transform \'s;.*/;;\' "'
         ).format(
             sshkey=self.sshkey,
             userhost=self.userhost,
             tmpZipPath=tmpZipPath,
             arg_file=tmpargfile_name,
         )
         # execute
         p1 = subprocess.Popen(com1,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         stdOut, stdErr = p1.communicate()
         retCode = p1.returncode
         if retCode != 0:
             msgStr = 'failed to make zip for {0} with {1}:{2}'.format(
                 lfn, stdOut, stdErr)
             self.zip_tmp_log.error(msgStr)
             return None, msgStr, {}
         del p1, stdOut, stdErr
         # delete tmpargfile
         com1a = ('ssh '
                  '-o StrictHostKeyChecking=no '
                  '-i {sshkey} '
                  '{userhost} '
                  '"{fileop_script} remove_file {file_path} "').format(
                      sshkey=self.sshkey,
                      userhost=self.userhost,
                      fileop_script=self.fileop_script,
                      file_path=tmpargfile_name,
                  )
         # execute
         p1a = subprocess.Popen(com1a,
                                shell=True,
                                close_fds=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
         stdOut, stdErr = p1a.communicate()
         retCode = p1a.returncode
         if retCode != 0:
             msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(
                 stdOut, stdErr)
             self.zip_tmp_log.error(msgStr)
         del p1a, stdOut, stdErr
         gc.collect()
         # avoid overwriting
         lockName = 'zip.lock.{0}'.format(lfn)
         lockInterval = 60
         tmpStat = False
         # get lock
         for i in range(lockInterval):
             tmpStat = self.dbInterface.get_object_lock(
                 lockName, lock_interval=lockInterval)
             if tmpStat:
                 break
             time.sleep(1)
         # failed to lock
         if not tmpStat:
             msgStr = 'failed to lock for {0}'.format(lfn)
             self.zip_tmp_log.error(msgStr)
             return None, msgStr, {}
         # rename to be zipPath
         com2 = ('ssh '
                 '-o StrictHostKeyChecking=no '
                 '-i {sshkey} '
                 '{userhost} '
                 '"test -f {zipPath} || mv {tmpZipPath} {zipPath}"').format(
                     sshkey=self.sshkey,
                     userhost=self.userhost,
                     zipPath=zipPath,
                     tmpZipPath=tmpZipPath,
                 )
         p2 = subprocess.Popen(com2,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         p2.communicate()
         del p2
         gc.collect()
         # release lock
         self.dbInterface.release_object_lock(lockName)
         # make return
         fileInfo = dict()
         fileInfo['path'] = zipPath
         # get size
         com3 = ('ssh '
                 '-o StrictHostKeyChecking=no '
                 '-i {sshkey} '
                 '{userhost} '
                 '"stat -c %s {zipPath}"').format(
                     sshkey=self.sshkey,
                     userhost=self.userhost,
                     zipPath=zipPath,
                 )
         p3 = subprocess.Popen(com3,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         stdOut, stdErr = p3.communicate()
         retCode = p3.returncode
         if retCode != 0:
             msgStr = 'failed to get file size of {0} with {1}:{2}'.format(
                 zipPath, stdOut, stdErr)
             self.zip_tmp_log.error(msgStr)
             return None, msgStr, {}
         else:
             stdOut_str = stdOut if (isinstance(stdOut, str) or
                                     stdOut is None) else stdOut.decode()
             file_size = int(stdOut_str.strip('\n'))
             fileInfo['fsize'] = file_size
         del p3, stdOut, stdErr
         gc.collect()
         # get checksum
         com4 = ('ssh '
                 '-o StrictHostKeyChecking=no '
                 '-i {sshkey} '
                 '{userhost} '
                 '"{fileop_script} adler32 {zipPath}"').format(
                     sshkey=self.sshkey,
                     userhost=self.userhost,
                     fileop_script=self.fileop_script,
                     zipPath=zipPath,
                 )
         p4 = subprocess.Popen(com4,
                               shell=True,
                               close_fds=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
         stdOut, stdErr = p4.communicate()
         retCode = p4.returncode
         if retCode != 0:
             msgStr = 'failed to get file adler32 of {0} with {1}:{2}'.format(
                 zipPath, stdOut, stdErr)
             self.zip_tmp_log.error(msgStr)
             return None, msgStr, {}
         else:
             stdOut_str = stdOut if (isinstance(stdOut, str) or
                                     stdOut is None) else stdOut.decode()
             file_chksum = stdOut_str.strip('\n')
             fileInfo['chksum'] = file_chksum
         del p4, stdOut, stdErr
         gc.collect()
     except Exception:
         errMsg = core_utils.dump_error_message(self.zip_tmp_log)
         return False, 'failed to zip with {0}'.format(errMsg)
     self.zip_tmp_log.debug('{0} done'.format(lfn))
     return True, '', fileInfo
Example #46
 def _check_assfile_existence(fileSpec):
     in_data = '\\n'.join([
         '{0}'.format(assFileSpec.path)
         for assFileSpec in fileSpec.associatedFiles
     ])
     com1 = (
         'ssh '
         '-o StrictHostKeyChecking=no '
         '-i {sshkey} '
         '{userhost} '
         '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "'
     ).format(
         sshkey=self.sshkey,
         userhost=self.userhost,
         fileop_script=self.fileop_script,
         suffix='_check-exist.tmp',
         dir=os.path.dirname(
             next(iter(fileSpec.associatedFiles)).path),
         data=in_data,
     )
     # execute
     p1 = subprocess.Popen(com1,
                           shell=True,
                           close_fds=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
     stdOut, stdErr = p1.communicate()
     retCode = p1.returncode
     if retCode != 0:
         msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(
             stdOut, stdErr)
         tmp_log.error(msgStr)
         return False, 'failed to zip with {0}'.format(msgStr)
     stdOut_str = stdOut if (isinstance(stdOut, str) or
                             stdOut is None) else stdOut.decode()
     tmpargfile_name = stdOut_str.strip('\n')
     del p1, stdOut, stdErr
     # record set
     existence_set = set()
     # make command
     com2 = (
         'ssh '
         '-o StrictHostKeyChecking=no '
         '-i {sshkey} '
         '{userhost} '
         '"cat {arg_file} | xargs -I%% sh -c \' test -f %% && echo T || echo F \' " '
     ).format(
         sshkey=self.sshkey,
         userhost=self.userhost,
         arg_file=tmpargfile_name,
     )
     # execute
     p2 = subprocess.Popen(com2,
                           shell=True,
                           close_fds=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
     stdOut, stdErr = p2.communicate()
     retCode = p2.returncode
     if retCode != 0:
         msgStr = 'failed to check existence of associated files with {0}:{1}'.format(
             stdOut, stdErr)
         tmp_log.error(msgStr)
     else:
         try:
             stdOut_str = stdOut if (
                 isinstance(stdOut, str)
                 or stdOut is None) else stdOut.decode()
             ret_list = stdOut_str.strip('\n').split('\n')
             if len(fileSpec.associatedFiles) == len(ret_list):
                 for (assFileSpec,
                      retVal) in zip(fileSpec.associatedFiles,
                                     ret_list):
                     if retVal == 'T':
                         existence_set.add(assFileSpec.path)
             else:
                 msgStr = 'returned number of files inconsistent! Skipped...'
                 tmp_log.error(msgStr)
         except Exception:
             core_utils.dump_error_message(tmp_log)
     del p2, stdOut, stdErr, com2
     # delete tmpargfile
     com3 = ('ssh '
             '-o StrictHostKeyChecking=no '
             '-i {sshkey} '
             '{userhost} '
             '"{fileop_script} remove_file {file_path} "').format(
                 sshkey=self.sshkey,
                 userhost=self.userhost,
                 fileop_script=self.fileop_script,
                 file_path=tmpargfile_name,
             )
     # execute
     p3 = subprocess.Popen(com3,
                           shell=True,
                           close_fds=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
     stdOut, stdErr = p3.communicate()
     retCode = p3.returncode
     if retCode != 0:
         msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(
             stdOut, stdErr)
         tmp_log.error(msgStr)
     del p3, stdOut, stdErr
     gc.collect()
     return existence_set
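
For comparison, a local analogue of the same existence check, usable when the associated files live on the local filesystem rather than behind ssh (it assumes only the fileSpec.associatedFiles structure used above):

import os

def check_assfile_existence_local(fileSpec):
    # return the set of associated file paths that actually exist on local disk
    return {assFileSpec.path
            for assFileSpec in fileSpec.associatedFiles
            if os.path.isfile(assFileSpec.path)}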
Example #47
 def update_jobs(self, jobspec_list, id):
     sw = core_utils.get_stopwatch()
     tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs')
     tmpLogG.debug('update {0} jobs'.format(len(jobspec_list)))
     retList = []
     # update events
     for jobSpec in jobspec_list:
         eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000)
         if eventRanges != []:
             tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID))
             tmpRet = self.update_event_ranges(eventRanges, tmpLogG)
             if tmpRet['StatusCode'] == 0:
                 for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']):
                     if retVal in [True, False] and eventSpec.is_final_status():
                         eventSpec.subStatus = 'done'
     # update jobs in bulk
     nLookup = 100
     iLookup = 0
     while iLookup < len(jobspec_list):
         dataList = []
         jobSpecSubList = jobspec_list[iLookup:iLookup+nLookup]
         for jobSpec in jobSpecSubList:
             data = jobSpec.get_job_attributes_for_panda()
             data['jobId'] = jobSpec.PandaID
             data['siteName'] = jobSpec.computingSite
             data['state'] = jobSpec.get_status()
             data['attemptNr'] = jobSpec.attemptNr
             data['jobSubStatus'] = jobSpec.subStatus
             # change cancelled and missed to failed so the panda server accepts the update
             if data['state'] in ['cancelled', 'missed']:
                 if jobSpec.is_pilot_closed():
                     data['jobSubStatus'] = 'pilot_closed'
                 else:
                     data['jobSubStatus'] = data['state']
                 data['state'] = 'failed'
             if jobSpec.startTime is not None and 'startTime' not in data:
                 data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S')
             if jobSpec.endTime is not None and 'endTime' not in data:
                 data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S')
             if 'coreCount' not in data and jobSpec.nCore is not None:
                 data['coreCount'] = jobSpec.nCore
             if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status():
                 if jobSpec.metaData is not None:
                     data['metaData'] = json.dumps(jobSpec.metaData)
                 if jobSpec.outputFilesToReport is not None:
                     data['xml'] = jobSpec.outputFilesToReport
             dataList.append(data)
         harvester_id = harvester_config.master.harvester_id
         tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id}
         tmpStat, tmpRes = self.post_ssl('updateJobsInBulk', tmpData)
         retMaps = None
         errStr = ''
         if tmpStat is False:
             errStr = core_utils.dump_error_message(tmpLogG, tmpRes)
         else:
             try:
                 tmpStat, retMaps = tmpRes.json()
                 if tmpStat is False:
                     tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps))
                     retMaps = None
             except Exception:
                 errStr = core_utils.dump_error_message(tmpLogG)
         if retMaps is None:
             retMap = {}
             retMap['content'] = {}
             retMap['content']['StatusCode'] = 999
             retMap['content']['ErrorDiag'] = errStr
             retMaps = [json.dumps(retMap)] * len(jobSpecSubList)
         for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList):
             tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID),
                                       method_name='update_jobs')
             try:
                 retMap = json.loads(retMap['content'])
             except Exception:
                 errStr = 'failed to load json'
                 retMap = {}
                 retMap['StatusCode'] = 999
                 retMap['ErrorDiag'] = errStr
             tmpLog.debug('data={0}'.format(str(data)))
             tmpLog.debug('done with {0}'.format(str(retMap)))
             retList.append(retMap)
         iLookup += nLookup
     tmpLogG.debug('done' + sw.get_elapsed_time())
     return retList
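
The bulk update above walks jobspec_list in slices of nLookup before calling updateJobsInBulk; a minimal standalone sketch of that batching pattern (iter_batches is an illustrative helper, not part of harvester):

def iter_batches(items, batch_size=100):
    # yield consecutive slices of at most batch_size items
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# usage: process jobs 100 at a time, mirroring the nLookup loop above
# for jobSpecSubList in iter_batches(jobspec_list):
#     ...build dataList and call post_ssl('updateJobsInBulk', ...)...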
Example #48
 def do_POST(self):
     # logger
     if self.tmpLog is None:
         self.tmpLog = core_utils.make_logger(_logger)
     toSkip = False
     form = None
     methodName = None
     dataStr = None
     message = ''
     # parse the form data posted
     try:
         form = self.get_form()
     except Exception:
         message = 'corrupted json'
         toSkip = True
     # check parameters
     if not toSkip:
         toSkip = True
         # method is not set
         if 'methodName' not in form:
             message = 'methodName is not given'
             self.send_response(400)
         elif 'workerID' not in form:
             message = 'workerID is not given'
             self.send_response(400)
         elif 'data' not in form:
             message = 'data is not given'
             self.send_response(400)
         else:
             toSkip = False
     # get worker
     if not toSkip:
         try:
             workerID = form['workerID']
             workSpec = self.dbProxy.get_worker_with_id(workerID)
             if workSpec is None:
                 message = 'workerID={0} not found in DB'.format(workerID)
                 self.send_response(400)
             else:
                 # chose file and operation for each action
                 methodName = form['methodName']
                 opType = None
                 filePath = ''
                 if methodName == 'requestJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonJobRequestFileName)
                     opType = 'w'
                 elif methodName == 'getJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jobSpecFileName)
                     opType = 'r'
                 elif methodName == 'requestEventRanges':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonEventsRequestFileName)
                     opType = 'w'
                 elif methodName == 'getEventRanges':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonEventsFeedFileName)
                     opType = 'r'
                 elif methodName == 'updateJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonAttrsFileName)
                     opType = 'w'
                 elif methodName == 'uploadJobReport':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonJobReport)
                     opType = 'w'
                 elif methodName == 'uploadEventOutputDump':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonOutputsFileName)
                     opType = 'w'
                 elif methodName == 'setPandaIDs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.pandaIDsFile)
                     opType = 'w'
                 elif methodName == 'killWorker':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.killWorkerFile)
                     opType = 'w'
                 elif methodName == 'heartbeat':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.heartbeatFile)
                     opType = 'w'
                 else:
                     self.send_response(501)
                     message = 'method not implemented'
                     toSkip = True
                 # take action
                 if not toSkip:
                     # write actions
                     if opType == 'w':
                         # check if file exists. Methods such as heartbeat however need to overwrite the file
                         if os.path.exists(filePath) and methodName not in [
                                 'heartbeat'
                         ]:
                             message = 'previous request is not yet processed'
                             self.send_response(503)
                         else:
                             with open(filePath, 'w') as fileHandle:
                                 json.dump(form['data'], fileHandle)
                                 message = 'OK'
                                 self.send_response(200)
                     else:
                         # read actions
                         if os.path.exists(filePath):
                             with open(filePath) as fileHandle:
                                 try:
                                     _message = json.load(fileHandle)
                                     message = json.dumps(_message)
                                     self.send_header(
                                         'Content-Type', 'application/json')
                                 except JSONDecodeError:
                                      fileHandle.seek(0)
                                      _f_qs = fileHandle.read()
                                     # _message = dict(parse_qsl(_f_qs, keep_blank_values=True))
                                     message = _f_qs
                                     self.send_header(
                                         'Content-Type', 'text/plain')
                                 self.send_response(200)
                         else:
                             message = 'previous request is not yet processed'
                             self.send_response(503)
         except Exception:
             self.send_response(500)
             message = core_utils.dump_error_message(_logger)
     if harvester_config.frontend.verbose:
         self.tmpLog.debug('ip={3} - method={0} json={1} msg={2}'.format(
             methodName, dataStr, message, self.client_address[0]))
     # set the response
     self.do_postprocessing(message)
     return
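
A hedged sketch of what a client call to this handler could look like; the URL and port are placeholders, and only the three fields the handler actually checks (methodName, workerID, data) are taken from the code above:

import requests

def post_to_frontend(method_name, worker_id, data,
                     url='http://localhost:25080'):  # hypothetical frontend endpoint
    # the handler expects a JSON body containing methodName, workerID and data
    payload = {'methodName': method_name,
               'workerID': worker_id,
               'data': data}
    resp = requests.post(url, json=payload, timeout=60)
    return resp.status_code, resp.text

# e.g. ask the frontend to write a job request file for worker 123
# status, body = post_to_frontend('requestJobs', 123, {})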
Example #49
    def check_stage_out_status(self, jobspec):
        # make logger
        tmpLog = self.make_logger(_logger,
                                  'PandaID={0}'.format(jobspec.PandaID),
                                  method_name='check_stage_out_status')
        tmpLog.debug('executing base check_stage_out_status')
        tmpStat, tmpMsg = GlobusBulkStager.check_stage_out_status(
            self, jobspec)
        tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg))
        if tmpStat is not True:
            return tmpStat, tmpMsg
        # get transfer groups
        groups = jobspec.get_groups_of_output_files()
        if len(groups) == 0:
            return tmpStat, tmpMsg
        # get the queueConfig and corresponding objStoreID_ES
        queueConfigMapper = QueueConfigMapper()
        queueConfig = queueConfigMapper.get_queue(jobspec.computingSite)
        # write to debug log queueConfig.stager
        tmpLog.debug(
            'jobspec.computingSite - {0} queueConfig.stager {1}'.format(
                jobspec.computingSite, queueConfig.stager))
        # check the queueConfig stager section to see if srcRSE is set
        if 'srcRSE' in queueConfig.stager:
            srcRSE = queueConfig.stager['srcRSE']
        else:
            srcRSE = None
            tmpLog.debug(
                'Warning srcRSE not defined in stager portion of queue config file'
            )
        # get destination endpoint
        nucleus = jobspec.jobParams['nucleus']
        agis = self.dbInterface.get_cache('panda_queues.json').data
        dstRSE = [
            agis[x]["astorages"]['pr'][0] for x in agis
            if agis[x]["atlas_site"] == nucleus
        ][0]
        # log source and destination RSEs and verify that both are defined
        tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE, dstRSE))
        errStr = ''
        if srcRSE is None:
            errStr = 'Source RSE is not defined '
        if dstRSE is None:
            errStr = errStr + ' Destination RSE is not defined'
        if (srcRSE is None) or (dstRSE is None):
            tmpLog.error(errStr)
            return None, errStr
        # check queueConfig stager section to see if jobtype is set
        if 'jobtype' in queueConfig.stager:
            if queueConfig.stager['jobtype'] == "Yoda":
                self.Yodajob = True
        # set the location of the files in fileSpec.objstoreID
        # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json
        ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data
        self.objstoreID = ddm[dstRSE]['id']
        if self.Yodajob:
            self.pathConvention = int(queueConfig.stager['pathConvention'])
            tmpLog.debug(
                'Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'
                .format(jobspec.PandaID, self.objstoreID, self.pathConvention))
        else:
            self.pathConvention = None
            tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(
                jobspec.PandaID, self.objstoreID))
        # set the location of the files in fileSpec.objstoreID
        self.set_FileSpec_objstoreID(jobspec, self.objstoreID,
                                     self.pathConvention)
        # create the Rucio Client
        try:
            # instantiate the Rucio client
            rucioAPI = RucioClient()
        except Exception:
            core_utils.dump_error_message(tmpLog)
            # treat as a temporary error
            tmpStat = None
            tmpMsg = 'failed to create the Rucio client'
            return tmpStat, tmpMsg
        # loop over all transfers
        tmpStat = True
        tmpMsg = ''
        for transferID in groups:
            if transferID is None:
                continue
            datasetName = 'panda.harvester.{0}.{1}'.format(
                jobspec.PandaID, transferID)
            datasetScope = 'transient'
            # lock
            have_db_lock = self.dbInterface.get_object_lock(transferID,
                                                            lock_interval=120)
            if not have_db_lock:
                msgStr = 'escape since {0} is locked by another thread'.format(
                    transferID)
                tmpLog.debug(msgStr)
                return None, msgStr
            # get transfer status
            groupStatus = self.dbInterface.get_file_group_status(transferID)
            if 'hopped' in groupStatus:
                # already succeeded
                pass
            elif 'failed' in groupStatus:
                # transfer failure
                tmpStat = False
                tmpMsg = 'rucio rule for {0}:{1} already failed'.format(
                    datasetScope, datasetName)
            elif 'hopping' in groupStatus:
                # check rucio rule
                ruleStatus = 'FAILED'
                try:
                    tmpLog.debug('check state for {0}:{1}'.format(
                        datasetScope, datasetName))
                    for ruleInfo in rucioAPI.list_did_rules(
                            datasetScope, datasetName):
                        if ruleInfo['rse_expression'] != dstRSE:
                            continue
                        ruleStatus = ruleInfo['state']
                        tmpLog.debug('got state={0}'.format(ruleStatus))
                        if ruleStatus == 'OK':
                            break
                except DataIdentifierNotFound:
                    tmpLog.error('dataset not found')
                except Exception:
                    core_utils.dump_error_message(tmpLog)
                    ruleStatus = None
                if ruleStatus in ['FAILED', 'CANCELED']:
                    # transfer failure
                    tmpStat = False
                    tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(
                        datasetScope, datasetName, ruleStatus)
                    # update file group status
                    self.dbInterface.update_file_group_status(
                        transferID, 'failed')
                elif ruleStatus == 'OK':
                    # update successful file group status
                    self.dbInterface.update_file_group_status(
                        transferID, 'hopped')
                else:
                    # replicating or temporary error
                    tmpStat = None
                    tmpMsg = 'replicating or temporary error for {0}:{1}'.format(
                        datasetScope, datasetName)
            else:
                # make rucio rule
                fileSpecs = self.dbInterface.get_files_with_group_id(
                    transferID)
                fileList = []
                for fileSpec in fileSpecs:
                    tmpFile = dict()
                    tmpFile['scope'] = datasetScope
                    tmpFile['name'] = fileSpec.lfn
                    tmpFile['bytes'] = fileSpec.fsize
                    tmpFile['adler32'] = fileSpec.chksum
                    if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                        tmpFile['meta'] = {
                            'guid': fileSpec.fileAttributes['guid']
                        }
                    else:
                        tmpLog.debug(
                            'File - {0} does not have a guid value'.format(
                                fileSpec.lfn))
                    tmpLog.debug('Adding file {0} to fileList'.format(
                        fileSpec.lfn))
                    fileList.append(tmpFile)
                    # get source RSE
                    if srcRSE is None and fileSpec.objstoreID is not None:
                        ddm = self.dbInterface.get_cache(
                            'agis_ddmendpoints.json').data
                        srcRSE = [
                            x for x in ddm
                            if ddm[x]["id"] == fileSpec.objstoreID
                        ][0]
                try:
                    # register dataset
                    tmpLog.debug(
                        'register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}'
                        .format(datasetScope, datasetName, srcRSE,
                                (30 * 24 * 60 * 60)))
                    try:
                        rucioAPI.add_dataset(datasetScope,
                                             datasetName,
                                             meta={'hidden': True},
                                             lifetime=30 * 24 * 60 * 60,
                                             rse=srcRSE)
                    except DataIdentifierAlreadyExists:
                        # ignore even if the dataset already exists
                        pass
                    except Exception:
                        errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(
                            datasetScope, datasetName, srcRSE)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.error(errMsg)
                        raise
                        # return None,errMsg
                    # add files to dataset
                    #  add 500 files at a time
                    numfiles = len(fileList)
                    maxfiles = 500
                    numslices = numfiles // maxfiles  # integer division so range() below gets an int
                    if (numfiles % maxfiles) > 0:
                        numslices = numslices + 1
                    start = 0
                    for i in range(numslices):
                        try:
                            stop = start + maxfiles
                            if stop > numfiles:
                                stop = numfiles

                            rucioAPI.add_files_to_datasets(
                                [{
                                    'scope': datasetScope,
                                    'name': datasetName,
                                    'dids': fileList[start:stop],
                                    'rse': srcRSE
                                }],
                                ignore_duplicate=True)
                            start = stop
                        except FileAlreadyExists:
                            # ignore if files already exist
                            pass
                        except Exception:
                            errMsg = 'Could not add files to DS - {0}:{1}  rse - {2} files - {3}'.format(
                                datasetScope, datasetName, srcRSE, fileList)
                            core_utils.dump_error_message(tmpLog)
                            tmpLog.error(errMsg)
                            return None, errMsg
                    # add rule
                    try:
                        tmpDID = dict()
                        tmpDID['scope'] = datasetScope
                        tmpDID['name'] = datasetName
                        tmpRet = rucioAPI.add_replication_rule([tmpDID],
                                                               1,
                                                               dstRSE,
                                                               lifetime=30 *
                                                               24 * 60 * 60)
                        ruleIDs = tmpRet[0]
                        tmpLog.debug(
                            'registered dataset {0}:{1} with rule {2}'.format(
                                datasetScope, datasetName, str(ruleIDs)))
                    except DuplicateRule:
                        # ignore duplicated rule
                        tmpLog.debug('rule is already available')
                    except Exception:
                        errMsg = 'Error creating rule for dataset {0}:{1}'.format(
                            datasetScope, datasetName)
                        core_utils.dump_error_message(tmpLog)
                        tmpLog.debug(errMsg)
                        #raise
                        return None, errMsg
                    # update file group status
                    self.dbInterface.update_file_group_status(
                        transferID, 'hopping')
                except Exception:
                    core_utils.dump_error_message(tmpLog)
                    # treat as a temporary error
                    tmpStat = None
                    tmpMsg = 'failed to add a rule for {0}:{1}'.format(
                        datasetScope, datasetName)
            # release lock
            self.dbInterface.release_object_lock(transferID)
            # escape if already failed
            if tmpStat is False:
                break
        # all done
        if tmpStat is True:
            self.set_FileSpec_status(jobspec, 'finished')
        tmpLog.debug('done with {0} : {1}'.format(tmpStat, tmpMsg))
        return tmpStat, tmpMsg
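
The loop above folds Rucio rule states into harvester's three-valued return convention (True = done, False = fatal, None = retry later); a minimal sketch of just that mapping, with the state names taken from the code above:

def rule_state_to_status(rule_state):
    # True: transfer finished, False: fatal failure, None: still replicating or temporary error
    if rule_state == 'OK':
        return True
    if rule_state in ('FAILED', 'CANCELED'):
        return False
    return None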
Example #50
 def run(self):
     lockedBy = 'sweeper-{0}'.format(self.get_pid())
     while True:
         mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
         mainLog.debug('try to get workers to kill')
         # get workers to kill
         workersToKill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers,
                                                          harvester_config.sweeper.checkInterval)
         mainLog.debug('got {0} queues to kill workers'.format(len(workersToKill)))
         # loop over all workers
         for queueName, configIdWorkSpecs in iteritems(workersToKill):
             for configID, workSpecs in iteritems(configIdWorkSpecs):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(queueName, configID):
                     mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                 for workSpec in workSpecs:
                     tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start killing')
                         tmpStat, tmpOut = sweeperCore.kill_worker(workSpec)
                         tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
         mainLog.debug('done kill')
          # timeouts for cleaning up missed and pending workers
         try:
             keepMissed = harvester_config.sweeper.keepMissed
         except Exception:
             keepMissed = 24
         keepPending = 24
         # get workers for cleanup
         statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished,
                             'failed': harvester_config.sweeper.keepFailed,
                             'cancelled': harvester_config.sweeper.keepCancelled,
                             'missed': keepMissed,
                             'pending': keepPending
                             }
         workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers,
                                                                  statusTimeoutMap)
         mainLog.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup)))
         for queueName, configIdWorkSpecs in iteritems(workersForCleanup):
             for configID, workSpecs in iteritems(configIdWorkSpecs):
                 # get sweeper
                 if not self.queueConfigMapper.has_queue(queueName, configID):
                     mainLog.error('queue config for {0}/{1} not found'.format(queueName, configID))
                     continue
                 queueConfig = self.queueConfigMapper.get_queue(queueName, configID)
                 sweeperCore = self.pluginFactory.get_plugin(queueConfig.sweeper)
                 for workSpec in workSpecs:
                     tmpLog = self.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID),
                                               method_name='run')
                     try:
                         tmpLog.debug('start cleanup')
                         tmpStat, tmpOut = sweeperCore.sweep_worker(workSpec)
                         tmpLog.debug('done with status={0} diag={1}'.format(tmpStat, tmpOut))
                         if tmpStat:
                             # delete from DB
                             self.dbProxy.delete_worker(workSpec.workerID)
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
         # delete old jobs
         mainLog.debug('delete old jobs')
         jobTimeout = max(statusTimeoutMap.values()) + 1
         self.dbProxy.delete_old_jobs(jobTimeout)
         mainLog.debug('done cleanup')
         # check if being terminated
         if self.terminated(harvester_config.sweeper.sleepTime):
             mainLog.debug('terminated')
             return
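
The run loop depends on terminated() from the harvester agent base class to sleep between cycles and exit cleanly; a minimal sketch of that pattern using only threading.Event (StoppableLoop and its attributes are illustrative, not the actual harvester base class):

import threading

class StoppableLoop(object):
    def __init__(self):
        self._stop_event = threading.Event()

    def terminated(self, wait_seconds):
        # sleep for up to wait_seconds, returning True early if stop() was called
        return self._stop_event.wait(wait_seconds)

    def stop(self):
        self._stop_event.set()

    def run(self):
        while True:
            # ... one sweep/cleanup cycle ...
            if self.terminated(60):
                return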