def recoverFINISHEDDIRTY(s, context, ftsFileName):
    """Sort the unfinished files of a job into a retry list and a missing list.

    The job id is resolved with ``getNewStatus`` and the per-file status is
    fetched from FTS.  Every file whose state is not ``FINISHED`` is
    classified by its failure reason:

    * reason contains "500 No such file or directory": recorded with
      ``writeTransfer(..., "DONE/Bad/", "M", ...)``;
    * anything else: passed to ``cleanUpTransfer`` and recorded with
      ``writeTransfer(..., "TODO/", "D", ...)``.

    :param s: handle passed through to ``getNewStatus``
    :param context: FTS3 context used for the status queries
    :param ftsFileName: name of the transfer list file being recovered
    """
    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName, context=context)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)

    failedFiles = []
    missFiles = []
    unclassified = 0
    for fileInfo in jobStat['files']:
        if fileInfo["file_state"] == "FINISHED":
            continue
        pair = (fileInfo["source_surl"], fileInfo["dest_surl"])
        # A missing reason used to raise TypeError on the substring tests
        # below; treat it as an empty (unclassified) reason instead.
        reason = fileInfo["reason"] or ""
        if "Probably stalled" in reason or "globus_ftp_control_local_pasv failed" in reason:
            failedFiles.append(pair)
        elif "500 No such file or directory" in reason:
            # Single-argument print calls give the same output on Python 2 and 3.
            print("%s %s %s" % (fServer, fileInfo["source_surl"], reason))
            print(fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID)
            missFiles.append(pair)
        else:
            unclassified += 1
            # Only the first unclassified failure is reported, to limit noise.
            if unclassified < 2:
                print("%s %s %s" % (ftsFileName, fileInfo["source_surl"], reason[:50]))
            failedFiles.append(pair)

    cleanUpTransfer(failedFiles, ftsFileName)
    writeTransfer(failedFiles, "TODO/", "D", ftsFileName)
    writeTransfer(missFiles, "DONE/Bad/", "M", ftsFileName)
def submit(proxy, toTrans, source, destination):
    """Submit the given LFNs as an FTS3 job and print the keys of its status.

    :param proxy: path to the proxy, used as both certificate and key
    :param toTrans: LFNs to transfer
    :param source: source site, passed to ``apply_tfc_to_lfn``
    :param destination: destination site, passed to ``apply_tfc_to_lfn``
    """
    # Build the transfer list; one curl handle is shared by each chunk of 200 LFNs.
    transfers = []
    for files in chunks(toTrans, 200):
        c = pycurl.Curl()
        try:
            # create destination and source pfns for job
            for lfn in files:
                print(lfn)
                transfers.append(
                    fts3.new_transfer(apply_tfc_to_lfn(source, lfn, c),
                                      apply_tfc_to_lfn(destination, lfn, c)))
        finally:
            # Release the handle even when resolving a PFN fails.
            c.close()

    # Submit fts job
    context = fts3.Context('https://fts3.cern.ch:8446', proxy, proxy, verify=True)
    print(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

    job = fts3.new_job(transfers)
    # Monitor link: https://fts3.cern.ch:8449/fts3/ftsmon/#/job/<jobid>
    jobid = fts3.submit(context, job)

    # Iterating the status dict yields its keys on both Python 2 and 3.
    for key in fts3.get_job_status(context, jobid, list_files=True):
        print(key)
def getNewStatus(s, f, fid=""):
    """Return the current FTS state of the job that handled file list ``f``.

    :param s: handle passed through to ``getStatusForJob``
    :param f: name of the transfer list file
    :param fid: FTS job id; when shorter than 3 characters the id recorded by
        ``getStatusForJob`` is used instead
    :returns: ``(job_state, job_id, status_dict, server)``, or
        ``("Unknown-notsubmitted", -1, 0, "-1")`` when no job is recorded
    """
    # The server always comes from the bookkeeping.  Previously it was only
    # bound when the id was looked up as well, so supplying ``fid`` raised
    # UnboundLocalError.
    (knownId, _fstat, _fIter, fServer) = getStatusForJob(s, f)
    if len(fid) < 3:
        fid = knownId
        if fid == "-1":
            # Same output as the former Python 2 print statement.
            print("File  %s not submitted to FTS?" % (f,))
            return "Unknown-notsubmitted", -1, 0, "-1"

    context = fts3.Context(fServer)
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
def monitorFTS3(self, full=False):
    """Query the FTS3 server for the status of this job.

    :param full: accepted for interface compatibility; not used in this body
    :returns: ``S_ERROR`` when ``self.FTSGUID`` is unset or the query fails;
        otherwise nothing is returned explicitly
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    jobStatusDict = None
    try:
        context = fts3.Context(endpoint=self.FTSServer)
        jobStatusDict = fts3.get_job_status(context, self.FTSGUID, list_files=True)
    # "except X as e" is valid on Python 2.6+ and 3; "except X, e" is Python 2 only.
    except Exception as e:
        return S_ERROR("Error getting the job status %s" % e)
def monitorFTS3(self, full=False):
    """Fetch the status of this job from the FTS3 server.

    :param full: kept for interface compatibility; this body does not read it
    :returns: ``S_ERROR`` if ``self.FTSGUID`` is not set or the status query
        raises; nothing is returned explicitly on success
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    jobStatusDict = None
    try:
        context = fts3.Context(endpoint=self.FTSServer)
        jobStatusDict = fts3.get_job_status(context, self.FTSGUID, list_files=True)
    # The comma form of "except" is a syntax error on Python 3.
    except Exception as e:
        return S_ERROR("Error getting the job status %s" % e)
def getNewStatus(s, f, fid="", context=0):
    """Return the current FTS state of the job that handled file list ``f``.

    :param s: handle passed through to ``getStatusForJob``
    :param f: name of the transfer list file
    :param fid: FTS job id; when shorter than 3 characters the id recorded by
        ``getStatusForJob`` is used instead
    :param context: FTS3 context to reuse; ``0`` means create one for the
        server recorded for ``f``
    :returns: ``(job_state, job_id, status_dict, server)``;
        ``("Unknown-notsubmitted", -1, 0, "-1")`` when no job is recorded and
        ``("Unknow", -1, 0, "-1")`` when the status query fails
    """
    # Always resolve the server.  It used to be bound only when the id was
    # looked up, so a caller-supplied ``fid`` either crashed here or was
    # silently reported as "Unknow" by the catch-all handler below.
    (knownId, _fstat, _fIter, fServer) = getStatusForJob(s, f)
    if len(fid) < 3:
        fid = knownId
        if fid == "-1":
            return "Unknown-notsubmitted", -1, 0, "-1"

    if context == 0:
        context = fts3.Context(fServer)

    try:
        ftsStat = fts3.get_job_status(context, fid)
        return ftsStat["job_state"], fid, ftsStat, fServer
    # Was a bare "except:", which also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        # Same output as the former Python 2 print statement; the misspelt
        # "Unknow" status is kept because callers may compare against it.
        print("File  %s unknown to FTS?" % (f,))
        return "Unknow", -1, 0, "-1"
def _fts_wait_jobs(context, job_map_list, sleep_time=10): """ """ finished_jobs = [] while len(finished_jobs) < len(job_map_list): for job_map in job_map_list: try: job_id = job_map['job_id'] if job_id in finished_jobs: continue response = fts3.get_job_status(context, job_id, list_files=True) if response['http_status'] == "200 Ok": if response["job_finished"]: finished_jobs.append(job_id) _flush_logging_msg( 'Job with id {} finished with job_state:{} | {}/{}'. format(job_id, response['job_state'], len(finished_jobs), len(job_map_list))) if response['job_state'] == "FINISHED": _gfal_rm_files(job_map['files_to_purge'], job_map['directory']) _flush_logging_msg( "Removing testing files from destination") else: filenames = [] for file_map in response['files']: if file_map['file_state'] == 'FINISHED': filenames.append( file_map['dest_surl'].split( "/dest/")[1]) _flush_logging_msg( "Removing testing files from destination") _gfal_rm_files(filenames, job_map['directory']) else: _flush_logging_msg('Server http status: {}'.format( response['http_status'])) finished_jobs.append(job_id) except Exception as e: _flush_logging_msg("Polling failed:{}, response:{}".format( e, response)) finished_jobs.append(job_id) _flush_logging_msg( "Sleeping for {} seconds before commencing polling again..".format( sleep_time)) time.sleep(sleep_time) return None
def recoverFINISHEDDIRTY(s, ftsFileName):
    """Clean up and retry every file of a job that did not reach ``FINISHED``.

    :param s: handle passed through to ``getNewStatus``
    :param ftsFileName: name of the transfer list file being recovered
    """
    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
    # Single-argument print calls: same output as the former Python 2 print
    # statements, and valid on Python 3 as well.
    print("Finished dirty for  %s  with ftsID %s  status %s ."
          % (ftsFileName, ftsJID, status))

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)

    # (source, destination) pairs of everything that is not FINISHED.
    failedFiles = [(fileInfo["source_surl"], fileInfo["dest_surl"])
                   for fileInfo in jobStat['files']
                   if fileInfo["file_state"] != "FINISHED"]
    for fF in failedFiles:
        print(fF)

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
def recoverFINISHEDDIRTY(s, ftsFileName):
    """Retry the unfinished files of a job, or the whole list if FTS lost it.

    When the job status is ``"Unknown"`` or no job id is available (``-1``),
    the file is moved from ``DONE/Dirty/`` back to ``TODO/``.  Otherwise every
    file that is not ``FINISHED`` is passed to ``cleanUpTransfer`` and
    ``retryFailedTransfer``; nothing is done when there is no such file.

    :param s: handle passed through to ``getNewStatus``
    :param ftsFileName: name of the transfer list file being recovered
    """
    dirtyPath = ceBase + "DONE/Dirty/" + ftsFileName
    todoPath = ceBase + "TODO/" + ftsFileName

    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
    if status == "Unknown":
        # Try again ...
        shutil.move(dirtyPath, todoPath)
        return

    # Single-argument print calls: same output as the former Python 2 print
    # statements, and valid on Python 3 as well.
    print("Finished dirty for  %s  with ftsID %s  status %s ."
          % (ftsFileName, ftsJID, status))
    if ftsJID == -1:
        print("Probably in old sqlite dB. Could not check - retry")
        shutil.move(dirtyPath, todoPath)
        return

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
    failedFiles = [(fileInfo["source_surl"], fileInfo["dest_surl"])
                   for fileInfo in jobStat['files']
                   if fileInfo["file_state"] != "FINISHED"]
    if not failedFiles:
        return

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
(options, args) = opts.parse_args() # get the jobID as the last parameter if len(args) < 2 and not options.uniq: opts.print_usage() sys.exit(1) job_id = args[0] reasons = [] if not options.uniq: for r in args[1:]: reasons.append(convert_reason_to_regexp(r)) # pprint(reasons) context = fts3.Context(options.endpoint) job_status = fts3.get_job_status(context, job_id, list_files=True) if job_status['job_state'] not in ['FINISHED', 'FINISHEDDIRTY', 'CANCELED', 'FAILED']: print "Sorry, job %s has not finished yet, its' status is %s" % (job_id, job_status['job_state']) sys.exit(0) if options.uniq: handle_uniq() sys.exit(0) if options.invert: notTransferedFiles = [(f['source_surl'], f['dest_surl']) for f in job_status['files'] if f['file_state'] in ['FAILED', 'CANCELED'] and not matches(f['reason'], reasons, options.verbose)] #and sanitize_error(f['reason']) not in reasons] else:
def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Queries the fts server to monitor the job.
        The internal state of the object is updated depending on the
        monitoring result.

        In case the job is not found on the server, the status is set to 'Failed'

        Within a job, only the transfers having a `fileID` metadata are considered.
        This is to allow for multihop jobs doing a staging

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute
        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns: {FileID: { status, error } }

                  Possible error numbers

                  * errno.ESRCH: If the job does not exist on the server
                  * errno.EDEADLK: In case the job and file status are inconsistent
                    (see comments inside the code)
    """
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        if not ftsServer:
            ftsServer = self.ftsServer
        context = fts3.Context(endpoint=ftsServer,
                               ucert=ucert,
                               request_class=ftsSSLRequest,
                               verify=False)

    jobStatusDict = None
    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    # The job is not found
    # Set its status to Failed and return
    except NotFound:
        self.status = 'Failed'
        return S_ERROR(
            errno.ESRCH,
            "FTSGUID %s not found on %s" % (self.ftsGUID, self.ftsServer))
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    now = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = now

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
        self.status = newStatus
        self.lastUpdate = now
        self.error = jobStatusDict['reason']

    # First pass over the unfiltered file list; it is repeated below once
    # the transfers without fileID have been dropped.  It is kept here so the
    # accounting is still filled when the EDEADLK early return is taken.
    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    filesStatus = {}
    statusSummary = {}
    # Transfers that correspond to a file in our DB; built as a new list
    # instead of removing entries from the list being scanned.
    trackedFiles = []

    for fileDict in jobStatusDict['files']:
        file_state = fileDict['file_state'].capitalize()
        file_metadata = fileDict['file_metadata']

        # previous version of the code did not have dictionary as
        # file_metadata
        if isinstance(file_metadata, dict):
            file_id = file_metadata.get('fileID')
        else:
            file_id = file_metadata

        # The transfer does not have a fileID attached to it
        # so it does not correspond to a file in our DB: skip it
        # (typical of jobs with different staging protocol == CTA)
        # It is also left out of the list used for accounting
        if not file_id:
            continue
        trackedFiles.append(fileDict)

        file_error = fileDict['reason']
        filesStatus[file_id] = {'status': file_state, 'error': file_error}

        # If the state of the file is final for FTS, set ftsGUID of the file to None,
        # such that it is "released" from this job and not updated anymore in future
        # monitoring calls
        if file_state in FTS3File.FTS_FINAL_STATES:
            filesStatus[file_id]['ftsGUID'] = None
        # If the file is not in a final state, but the job is, we return an error
        # FTS can have inconsistencies where the FTS Job is in a final state
        # but not all the files.
        # The inconsistencies are cleaned every hour on the FTS side.
        # https://its.cern.ch/jira/browse/FTS-1482
        elif self.status in self.FINAL_STATES:
            return S_ERROR(
                errno.EDEADLK,
                "Job %s in a final state (%s) while File %s is not (%s)" %
                (self.ftsGUID, self.status, file_id, file_state))

        statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    # We've removed all the intermediate transfers that we are not interested in
    # so we put this back into the monitoring data such that the accounting is done properly
    jobStatusDict['files'] = trackedFiles
    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    total = len(trackedFiles)
    completed = sum(
        statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES)
    # A job made only of transfers without fileID leaves nothing to count;
    # keep the previous completeness rather than dividing by zero.
    if total:
        self.completeness = int(100 * completed / total)

    return S_OK(filesStatus)
def monitorFTS3(self, full=False):
    """Refresh the status of this job and, optionally, of its files from FTS3.

    The FTS3 context is created on first use and kept in
    ``self._fts3context``.  ``self.Status`` and ``self.Completeness`` are
    always updated.

    :param full: when false, stop after the summary; when true, also copy
        status, error and duration onto the matching files of this job
    :returns: ``S_ERROR`` if ``self.FTSGUID`` is unset or the query fails;
        ``S_OK({state: count})`` when ``full`` is false; otherwise the
        result of ``self.finalize()`` if the job state is in
        ``FTSJob.FINALSTATES``, else ``S_OK()``
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    jobStatusDict = None
    try:
        if not self._fts3context:
            self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                             request_class=ftsSSLRequest,
                                             verify=False)
        jobStatusDict = fts3.get_job_status(self._fts3context,
                                            self.FTSGUID,
                                            list_files=True)
    except Exception as e:
        return S_ERROR("Error getting the job status %s" % e)

    self.Status = jobStatusDict['job_state'].capitalize()
    fileRecords = jobStatusDict['files']

    # Count the files per (capitalized) FTS state.
    statusSummary = {}
    for record in fileRecords:
        state = record['file_state'].capitalize()
        statusSummary[state] = statusSummary.get(state, 0) + 1

    nbFiles = len(fileRecords)
    nbFinal = sum(
        statusSummary.get(finalState, 0)
        for finalState in FTSFile.FINAL_STATES)
    self.Completeness = 100 * nbFinal / nbFiles

    if not full:
        return S_OK(statusSummary)

    allFilesLogged = False
    for record in fileRecords:
        src = record['source_surl']
        dst = record['dest_surl']
        state = record['file_state'].capitalize()
        why = record['reason']
        elapsed = record['tx_duration']

        # First file of this job with the same source and target URLs.
        matched = next((candidate for candidate in self
                        if candidate.SourceSURL == src
                        and candidate.TargetSURL == dst), None)

        if matched is not None:
            matched.Status = state
            matched.Error = why
            matched._duration = elapsed
            if matched.Status == "Failed":
                for pattern in self.missingSourceErrors:
                    if pattern.match(why):
                        matched.Error = "MissingSource"
            continue

        self._log.warn('FTSFile not found',
                       'Source: %s, Target: %s' % (src, dst))
        # The list of known files is dumped only once per call.
        if allFilesLogged:
            continue
        allFilesLogged = True
        if len(self) == 0:
            self._log.warn('Monitored FTS job is empty!')
        else:
            knownFiles = [
                'Source: %s, Target: %s' % (known.SourceSURL, known.TargetSURL)
                for known in self
            ]
            self._log.warn('All FTS files are:', '\n' + '\n'.join(knownFiles))

    # register successful files
    if self.Status not in FTSJob.FINALSTATES:
        return S_OK()
    return self.finalize()
def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Ask the FTS server for the state of this job and of its files.

        ``self.ftsGUID`` must be set.  ``lastMonitor`` is always updated;
        ``status``, ``lastUpdate`` and ``error`` are updated when the job
        state changed, and ``completeness`` is recomputed from the file states.

        :param context: fts3 context; created from ``ftsServer`` and ``ucert``
                        when not given
        :param ftsServer: server address used to build the context; defaults
                          to ``self.ftsServer``
        :param ucert: path to the user certificate/proxy

        :returns: S_OK({FileID: {status, error}}); the entry of a file in a
                  final FTS state also carries ``'ftsGUID': None``
    """
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        context = fts3.Context(endpoint=ftsServer or self.ftsServer,
                               ucert=ucert,
                               request_class=ftsSSLRequest,
                               verify=False)

    jobStatusDict = None
    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    timestamp = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = timestamp

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
        self.status = newStatus
        self.lastUpdate = timestamp
        self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    fileRecords = jobStatusDict['files']
    filesStatus = {}
    perState = {}

    for record in fileRecords:
        state = record['file_state'].capitalize()
        fileID = record['file_metadata']
        reason = record['reason']

        entry = {'status': state, 'error': reason}
        # A file in a final FTS state is "released" from this job, so that
        # later monitoring calls do not update it anymore.
        if state in FTS3File.FTS_FINAL_STATES:
            entry['ftsGUID'] = None
        filesStatus[fileID] = entry

        perState[state] = perState.get(state, 0) + 1

    nbFiles = len(fileRecords)
    nbFinal = sum(perState.get(final, 0) for final in FTS3File.FTS_FINAL_STATES)
    self.completeness = 100 * nbFinal / nbFiles

    return S_OK(filesStatus)
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import fts3.rest.client.easy as fts3
from optparse import OptionParser

# Command line: [-s ENDPOINT] [-l] JOB_ID
opts = OptionParser()
opts.add_option('-s', '--endpoint', dest='endpoint',
                default='https://fts3-pilot.cern.ch:8446')
opts.add_option('-l', '--list', dest='list_files', default=False,
                action='store_true')

(options, args) = opts.parse_args()
if len(args) < 1:
    raise Exception('Need a job id')
job_id = args[0]

# Show the requests made by the REST client.
logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)
job_status = fts3.get_job_status(context, job_id, list_files=options.list_files)
# Call form of print: same output on Python 2, and valid on Python 3.
print(json.dumps(job_status, indent=2))
def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Poll the FTS server for this job and report the state of its files.

        ``self.ftsGUID`` must be set.  ``lastMonitor`` is always updated;
        ``status``, ``lastUpdate`` and ``error`` are updated when the job
        state changed, and ``completeness`` is recomputed from the file states.

        :param context: fts3 context; built from ``ftsServer`` and ``ucert``
                        when not given
        :param ftsServer: server address used to build the context; defaults
                          to ``self.ftsServer``
        :param ucert: path to the user certificate/proxy

        :returns: S_OK({FileID: {status, error}}), where the entry of a file
                  in a final FTS state also carries ``'ftsGUID': None``;
                  S_ERROR(errno.EDEADLK, ...) when the job is in a final
                  state but one of its files is not
    """
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        server = ftsServer if ftsServer else self.ftsServer
        context = fts3.Context(endpoint=server,
                               ucert=ucert,
                               request_class=ftsSSLRequest,
                               verify=False)

    jobStatusDict = None
    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    pollTime = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = pollTime

    newStatus = jobStatusDict['job_state'].capitalize()
    statusChanged = newStatus != self.status
    if statusChanged:
        self.status = newStatus
        self.lastUpdate = pollTime
        self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    transfers = jobStatusDict['files']
    filesStatus = {}
    stateCounts = {}

    for transfer in transfers:
        state = transfer['file_state'].capitalize()
        fileID = transfer['file_metadata']
        filesStatus[fileID] = {'status': state, 'error': transfer['reason']}

        if state in FTS3File.FTS_FINAL_STATES:
            # Final for FTS: release the file from this job so that future
            # monitoring calls leave it alone.
            filesStatus[fileID]['ftsGUID'] = None
        elif self.status in self.FINAL_STATES:
            # FTS can be inconsistent: the job is final but one of its files
            # is not.  FTS cleans this up every hour on its side, see
            # https://its.cern.ch/jira/browse/FTS-1482
            return S_ERROR(errno.EDEADLK,
                           "Job %s in a final state (%s) while File %s is not (%s)" %
                           (self.ftsGUID, self.status, fileID, state))

        stateCounts[state] = stateCounts.get(state, 0) + 1

    nbTransfers = len(transfers)
    nbFinal = sum(stateCounts.get(final, 0) for final in FTS3File.FTS_FINAL_STATES)
    self.completeness = 100 * nbFinal / nbTransfers

    return S_OK(filesStatus)
def killThread(self, thread_id, transfers):
    """Worker thread function for the kill command.

    Loops forever: takes a list of transfer documents from ``transfers``,
    obtains a proxy for their user, delegates it to the FTS server, cancels
    the listed files job by job and posts the ``KILLED`` state to the
    database.  ``transfers.task_done()`` is called once for every list taken
    from the queue, whatever the outcome.

    :param thread_id: identifier of this thread, used for logging only
    :param transfers: queue-like object providing ``get()`` and ``task_done()``
    """
    while True:
        transfer_list = transfers.get()
        self.logger.info("Starting thread %s" % (thread_id))
        user = transfer_list[0]['username']
        group = transfer_list[0]['user_group']
        role = transfer_list[0]['user_role']

        uiSetupScript = getattr(self.config, 'UISetupScript', None)

        self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy))
        try:
            userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            # This path used to skip task_done(), leaving the queue item
            # unacknowledged; handle it like the other failure paths.
            transfers.task_done()
            time.sleep(1)
            continue
        if not userDN:
            transfers.task_done()
            time.sleep(1)
            continue
        self.logger.debug("user DN: %s" % userDN)

        try:
            defaultDelegation = {'logger': self.logger,
                                 'credServerPath': self.config.credentialDir,
                                 'myProxySvr': 'myproxy.cern.ch',
                                 'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                 'serverDN': self.config.serverDN,
                                 'uisource': uiSetupScript,
                                 'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
            if hasattr(self.config, "cache_area"):
                cache_area = self.config.cache_area
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
        except IndexError:
            self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
            transfers.task_done()
            time.sleep(1)
            continue

        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey

        try:
            defaultDelegation['userDN'] = userDN
            defaultDelegation['group'] = group if group else ''
            # NOTE(review): the role is dropped when the group is empty;
            # confirm this is intended and not a typo for "if role".
            defaultDelegation['role'] = role if group else ''
            self.logger.debug('delegation: %s' % defaultDelegation)
            valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            transfers.task_done()
            time.sleep(1)
            continue

        # TODO: take server from db, right now, take only the first of the list and assuming it valid for all
        try:
            # TODO: debug u added during info upload. To be fixed soon! For now worked around
            fts_server = transfer_list[0]['fts_instance'].split('u')[1]
            self.logger.info("Delegating proxy to %s" % fts_server)
            context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True)
            self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

            self.logger.info("Proxy delegated. Grouping files by jobId")
            jobs = {}
            for fileToKill in transfer_list:
                # TODO: debug u added during info upload. To be fixed soon! For now worked around
                jid = str(fileToKill['fts_id']).split('u')[1]
                jobs.setdefault(jid, []).append(fileToKill)

            self.logger.info("Found %s jobIds", len(jobs))
            # Log the ids themselves; "jobs.keys" logged the bound method.
            self.logger.debug("jobIds: %s", list(jobs))

            # list for files killed or failed to
            killed = []
            too_late = []

            # items() works on Python 2 and 3; iteritems() is Python 2 only.
            for ftsJobId, files in jobs.items():
                self.logger.info("Cancelling tranfers in %s" % ftsJobId)

                ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files]
                source_lfns = [x['source_lfn'] for x in files]

                job_list = fts3.get_job_status(context, ftsJobId, list_files=True)
                tx = job_list['files']

                # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db?
                idListToKill = [x['file_id'] for x in tx
                                if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                # needed for the state update
                lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx
                                 if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill))
                res = fts3.cancel(context, ftsJobId, idListToKill)
                self.logger.debug('Kill command result: %s' % json.dumps(res))

                if not isinstance(res, list):
                    res = [res]

                # Verify if the kill command succeeded
                for k, kill_res in enumerate(res):
                    indexToUpdate = lfnListToKill[k]
                    if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"):
                        self.logger.debug(source_lfns[indexToUpdate])
                        too_late.append(getHashLfn(source_lfns[indexToUpdate]))
                    else:
                        killed.append(getHashLfn(source_lfns[indexToUpdate]))

            # TODO: decide how to update status for too_late files
            killed += too_late
            self.logger.debug('Updating status of killed files: %s' % killed)

            if killed:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = killed
                data['list_of_transfer_state'] = ["KILLED"] * len(killed)
                self.oracleDB.post(self.config.oracleFileTrans,
                                   data=encodeRequest(data))
                self.logger.debug("Marked killed %s" % killed)
        # Was a bare "except:", which also trapped KeyboardInterrupt/SystemExit.
        except Exception:
            # TODO: split and improve try/except
            self.logger.exception('Kill command failed')

        transfers.task_done()
def worker(self, i, input):
    """
    - get a token for fts
    - loop over users in queue
    - for each user get the list of jobid from filenames in Monitor/user folder
    - monitor the status of the job
    - if final, look the file statuses of the files
    - update the db state
    - move the monitor file to Done/ (failures are logged, not raised)

    :param i: id number of the thread
    :param input: queue of users
    :return:
    """
    if not self.config.TEST:
        context = fts3.Context(self.config_getter.serverFTS,
                               self.config_getter.opsProxy,
                               self.config_getter.opsProxy,
                               verify=True)

    logger = self.logger  # setProcessLogger('Mon'+str(i))
    logger.info("Process %s is starting. PID %s", i, os.getpid())
    Update = update(logger, self.config_getter)

    while not self.STOP:
        if input.empty():
            time.sleep(10)
            continue
        try:
            user = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        for File in os.listdir('Monitor/' + user):
            job = File.split('.')[0]
            try:
                if not self.config.TEST:
                    results = fts3.get_job_status(context, job, list_files=False)
                    self.logger.info('Getting status for job: ' + job + ' ' + results['job_state'])
                else:
                    # TEST mode: fabricate a status from the monitor file.
                    time.sleep(random.randint(0, random.randint(0, 3)))
                    # "with" closes the file; the handle used to be leaked.
                    with open('Monitor/' + user + '/' + File) as monitorFile:
                        lf = json.loads(monitorFile.read())
                    if random.randint(0, random.randint(0, 5)) == 0:
                        results = {
                            'job_state': 'FINISHED',
                            'files': [{
                                'file_metadata': {'lfn': x},
                                'file_state': 'FINISHED'
                            } for x in lf]
                        }
                    else:
                        results = {'job_state': 'SUBMITTED'}
                    self.logger.info('Getting status for job: ' + job + ' ' + results['job_state'])
            except Exception:
                logger.exception('Failed get job status for %s' % job)
                continue

            if results['job_state'] in ['FINISHED', 'FAILED', 'FINISHEDDIRTY', 'CANCELED']:
                if not self.config.TEST:
                    try:
                        results = fts3.get_job_status(context, job, list_files=True)
                    except Exception:
                        logger.exception('Failed get file statuses for %s' % job)
                        continue

                self.logger.info('Updating status for job: ' + job)

                failed_lfn = list()
                failed_reasons = list()
                done_lfn = list()
                for Fl in results['files']:
                    lfn = Fl['file_metadata']['lfn']
                    if Fl['file_state'] == 'FINISHED':
                        done_lfn.append(lfn)
                    else:
                        failed_lfn.append(lfn)
                        if Fl['reason'] is not None:
                            self.logger.warning('Failure reason: ' + Fl['reason'])
                            failed_reasons.append(Fl['reason'])
                        else:
                            # error(), not exception(): no exception is being
                            # handled here, so there is no traceback to attach.
                            self.logger.error('Failure reason not found')
                            failed_reasons.append('unable to get failure reason')

                try:
                    logger.info('Marking job %s files done and %s files failed for job %s'
                                % (len(done_lfn), len(failed_lfn), job))
                    doneReady = Update.transferred(done_lfn)
                    failedReady = Update.failed(failed_lfn, failed_reasons)
                except Exception:
                    logger.exception('Failed to update states')
                    continue

                # Keep the monitor file when either update reported 1.
                if doneReady == 1 or failedReady == 1:
                    continue

                try:
                    logger.info('Removing' + 'Monitor/' + user + '/' + File)
                    os.rename('Monitor/' + user + '/' + File, 'Done/' + File)
                except Exception:
                    logger.exception('failed to remove monitor file')
                    continue

        input.task_done()
        self.active_users.remove(user)
        time.sleep(1)

    logger.debug("Worker %s exiting.", i)
def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Fetch the state of this job and of each of its files from FTS.

        ``self.ftsGUID`` must be set.  ``lastMonitor`` is always updated;
        ``status``, ``lastUpdate`` and ``error`` are updated when the job
        state changed, and ``completeness`` is recomputed from the file states.

        :param context: fts3 context; built from ``ftsServer`` and ``ucert``
                        when not given
        :param ftsServer: server address used to build the context; defaults
                          to ``self.ftsServer``
        :param ucert: path to the user certificate/proxy

        :returns: S_OK({FileID: {status, error}})
    """
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        context = fts3.Context(endpoint=ftsServer or self.ftsServer,
                               ucert=ucert,
                               request_class=ftsSSLRequest,
                               verify=False)

    jobStatusDict = None
    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    checkedAt = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = checkedAt

    newStatus = jobStatusDict['job_state'].capitalize()
    if self.status != newStatus:
        self.status = newStatus
        self.lastUpdate = checkedAt
        self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    transfers = jobStatusDict['files']
    filesStatus = {}
    tally = {}
    for transfer in transfers:
        state = transfer['file_state'].capitalize()
        fileID = transfer['file_metadata']
        reason = transfer['reason']
        filesStatus[fileID] = {'status': state, 'error': reason}
        tally[state] = tally.get(state, 0) + 1

    # Percentage of the files that reached a final FTS state.
    nbFinal = sum(tally.get(final, 0) for final in FTS3File.FTS_FINAL_STATES)
    self.completeness = 100 * nbFinal / len(transfers)

    return S_OK(filesStatus)
const=-1, default=0, help='Do not resubmit any failed transfers') opts.add_option_group(loop_options) (options, args) = opts.parse_args() if len(args) < 1: opts.print_usage() sys.exit(1) job_id = args[0] context = fts3.Context(options.endpoint) if options.reg_endpoint: reg_context = fts3.Context(options.reg_endpoint) job_status = fts3.get_job_status(context, job_id, list_files=True) if job_status['job_state'] not in [ 'FINISHED', 'FINISHEDDIRTY', 'CANCELED', 'FAILED' ]: print "Sorry, job %s has not finished yet, its' status is %s" % ( job_id, job_status['job_state']) sys.exit(0) if job_status['job_state'] != 'FINISHED': print "The job had problems, its' status is %s" % job_status[ 'job_state'] transferedFiles = [ f['dest_surl'] for f in job_status['files'] if f['file_state'] in ['FINISHED'] or f['reason'] == 'DESTINATION file already exists and overwrite is not enabled'
def check_FTSJob(logger, ftsContext, jobid, jobsEnded, jobs_ongoing, done_id, failed_id, failed_reasons):
    """
    get transfers state per jobid

    INPUT PARAMS
    :param logger: a logging object
    :param ftsContext: FTS3 context used for the status queries
    :param jobid: id of the FTS job to check
    OUTPUT PARAMS (modified in place)
    :param jobsEnded: list, gets jobid appended when the job is in a final state
    :param jobs_ongoing: list, gets jobid appended (and removed again if FTS
        answers 404 for it)
    :param done_id: dict, jobid -> list of oracle ids transferred
    :param failed_id: dict, jobid -> list of oracle ids failed
    :param failed_reasons: dict, jobid -> list of failure reasons

    - check if the fts job is in final state (FINISHED, FINISHEDDIRTY, CANCELED, FAILED)
    - get file transfers states and get corresponding oracle ID from FTS file metadata
    - remove the source files of terminated transfers and drop their ids from
      the job's local json file
    """
    logger.info("Getting state of job %s" % jobid)

    jobs_ongoing.append(jobid)

    try:
        status = fts3.get_job_status(ftsContext, jobid, list_files=False)
    except HTTPException as hte:
        logger.exception("failed to retrieve status for %s " % jobid)
        logger.exception("httpExeption headers %s " % hte.headers)
        if hte.status == 404:
            logger.exception("%s not found in FTS3 DB" % jobid)
            jobs_ongoing.remove(jobid)
        return
    except Exception:
        logger.exception("failed to retrieve status for %s " % jobid)
        return

    logger.info("State of job %s: %s" % (jobid, status["job_state"]))
    if status["job_state"] in ['FINISHED', 'FINISHEDDIRTY', "FAILED", "CANCELED"]:
        jobsEnded.append(jobid)
    if status["job_state"] in ['ACTIVE', 'FINISHED', 'FINISHEDDIRTY', "FAILED", "CANCELED"]:
        file_statuses = fts3.get_job_status(ftsContext, jobid, list_files=True)['files']
        done_id[jobid] = []
        failed_id[jobid] = []
        failed_reasons[jobid] = []
        files_to_remove = []
        fileIds_to_remove = []

        # get the job content from local file
        jobContentFileName = 'task_process/transfers/' + jobid + '.json'
        with open(jobContentFileName, 'r') as fp:
            fileIds = json.load(fp)

        for file_status in file_statuses:
            _id = file_status['file_metadata']['oracleId']
            if _id not in fileIds:
                # this file xfer has been handled already in a previous iteration
                # nothing to do
                continue

            tx_state = file_status['file_state']

            # xfers have only 3 terminal states: FINISHED, FAILED, and CANCELED see
            # https://fts3-docs.web.cern.ch/fts3-docs/docs/state_machine.html
            if tx_state == 'FINISHED':
                done_id[jobid].append(_id)
                files_to_remove.append(file_status['source_surl'])
                fileIds_to_remove.append(_id)
            elif tx_state in ('FAILED', 'CANCELED'):
                failed_id[jobid].append(_id)
                if file_status['reason']:
                    logger.info('Failure reason: ' + file_status['reason'])
                    failed_reasons[jobid].append(file_status['reason'])
                else:
                    # error(), not exception(): no exception is being handled
                    # here, so there is no traceback to attach.
                    logger.error('Failure reason not found')
                    failed_reasons[jobid].append('unable to get failure reason')
                files_to_remove.append(file_status['source_surl'])
                fileIds_to_remove.append(_id)
            else:
                # file transfer is not terminal:
                if status["job_state"] == 'ACTIVE':
                    # if job is still ACTIVE file status will be updated in future run. See:
                    # https://fts3-docs.web.cern.ch/fts3-docs/docs/state_machine.html
                    pass
                else:
                    # job status is terminal but file xfer status is not.
                    # something went wrong inside FTS and a stuck transfers is waiting to be
                    # removed by the reapStalledTransfers https://its.cern.ch/jira/browse/FTS-1714
                    # mark as failed
                    failed_id[jobid].append(_id)
                    logger.info('Failure reason: stuck inside FTS')
                    failed_reasons[jobid].append(file_status['reason'])

        if files_to_remove:
            # gfal commands take list of SURL as a list of blank-separated strings;
            # str() converts JSON u'srm://....' to plain srm://...
            # (each SURL is followed by a blank, including the last one)
            list_of_surls = ''.join(str(f) + ' ' for f in files_to_remove)
            removeLogFile = './task_process/transfers/remove_files.log'
            remove_files_in_bkg(list_of_surls, removeLogFile)
            # remove those file Id's from the list and update the json disk file
            fileIds = list(set(fileIds) - set(fileIds_to_remove))
            # write to a temporary file first, then rename it over the original
            jobContentTmp = jobContentFileName + '.tmp'
            with open(jobContentTmp, 'w') as fp:
                json.dump(fileIds, fp)
            os.rename(jobContentTmp, jobContentFileName)
def lookAtFile(s, fN):
    """
    Inspect the FTS job behind transfer file fN and decide whether to retry it.

    The job is looked up with getStatusForJob(s, <basename of fN>) and its
    per-file failure reasons are read from FTS. As soon as one file shows a
    reason known to be recoverable, fN is moved to ceBase + "TODO/" so that the
    transfer is attempted again. Other reasons are only printed.

    :param s: opaque handle passed through to getStatusForJob
    :param fN: path of the transfer file; only its last path component is used
               as the job key and as the destination file name
    :return: 0 when the job was examined (or was unknown and re-queued),
             -1 when examining it raised; in that case fN is moved to
             ceBase + "TODO/B" + <basename>
    """
    tFN = fN.split("/")[-1]
    (ftsJID, stat, fIter, fServer) = getStatusForJob(s, tFN)

    if fServer == "-1":
        print("Unknown FTS job? Retry. %s" % tFN)
        shutil.move(fN, ceBase + "TODO/" + tFN)
        return 0

    # (substring looked for in the FTS reason, message printed before retrying).
    # Checked in this order; the first match wins.
    retryable = (
        ("TRANSFER CHECKSUM MISMATCH",
         "Transfer checksum mismatch - retrying  " + tFN),
        ("Probably stalled",
         "Stalled transfer - retrying  " + tFN),
        ("SOURCE SRM_GET_TURL error on the turl",
         "srm failure : probably diskserver was down. Retry"),
        ("Communication error on send",
         "srm failure : Known (old) problem with RAL FTS system. Retry"),
        ("Transfer canceled because the gsiftp performance marker timeout",
         "Recoverable error : 6 minute timeout exceeded. Retry"),
        ("bad data was encountered",
         "Recoverable error : Command failed. : bad data was encountered. Retry"),
        ("Command failed : error: commands denied",
         "Recoverable error : Command failed : error: commands denied. Retry"),
    )

    def monitorUrl():
        # same host as the FTS server, last port digit replaced by 9
        return fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID

    context = fts3.Context(fServer)
    try:
        jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
        for fileInfo in jobStat['files']:
            reason = fileInfo["reason"]

            if "No such file or directory" in reason:
                # a missing file is not retried; only a missing source is reported
                if "SOURCE" in reason:
                    print("Missing source :  %s %s" % (
                        fileInfo["source_surl"].split("SFN=")[1], monitorUrl()))
                continue

            for needle, message in retryable:
                if needle in reason:
                    print(message)
                    shutil.move(fN, ceBase + "TODO/" + tFN)
                    return 0

            # unrecognised failure: report it and look at the next file
            print("%s %s %s" % (tFN, monitorUrl(), fileInfo["reason"]))
            print(fileInfo["source_surl"].split("SFN=")[1])
    except Exception:
        # Any failure while examining the job sends the transfer back for
        # another attempt. `Exception` rather than a bare except, so that
        # Ctrl-C / SystemExit are no longer swallowed here.
        print("Could not find any information for  %s . Try the transfer again." % tFN)
        shutil.move(fN, ceBase + "TODO/B" + tFN)
        return -1

    return 0
def monitorFTS3(self, full=False):
    """
    Query FTS3 for the state of this job and update the job accordingly.

    Sets self.Status from the FTS job state and self.Completeness to the
    integer percentage of files whose state is in FTSFile.FINAL_STATES.

    :param full: when False (default) stop after the summary; when True also
                 copy state, error and duration from FTS onto the matching
                 FTS files held by this job, and finalize the job if it
                 reached a final state
    :return: S_ERROR if FTSGUID is unset or FTS could not be queried;
             S_OK( {file state: count} ) when full is False;
             otherwise the result of self.finalize() for a finished job,
             or an empty S_OK()
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    try:
        # the context is created on first use and kept on the instance
        if not self._fts3context:
            self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                             request_class=ftsSSLRequest,
                                             verify=False)
        jobStatusDict = fts3.get_job_status(self._fts3context, self.FTSGUID, list_files=True)
    except Exception as e:
        return S_ERROR("Error getting the job status %s" % e)

    self.Status = jobStatusDict['job_state'].capitalize()
    filesInfoList = jobStatusDict['files']

    # count the files per (capitalized) state
    statusSummary = {}
    for fileDict in filesInfoList:
        file_state = fileDict['file_state'].capitalize()
        statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    total = len(filesInfoList)
    completed = sum(statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES)
    # Integer percentage. A job for which FTS lists no files is reported as
    # 0% complete instead of failing with ZeroDivisionError.
    self.Completeness = 100 * completed // total if total else 0

    if not full:
        return S_OK(statusSummary)

    # Index our files by (source, target) once instead of scanning them for
    # every file FTS reports. setdefault keeps the first file of a duplicated
    # pair, which is the one a linear scan would have found.
    knownFiles = {}
    for ftsFile in self:
        knownFiles.setdefault((ftsFile.SourceSURL, ftsFile.TargetSURL), ftsFile)

    ftsFilesPrinted = False
    for fileDict in filesInfoList:
        sourceURL = fileDict['source_surl']
        targetURL = fileDict['dest_surl']
        fileStatus = fileDict['file_state'].capitalize()
        reason = fileDict['reason']
        duration = fileDict['tx_duration']

        candidateFile = knownFiles.get((sourceURL, targetURL))
        if candidateFile is None:
            self._log.warn('FTSFile not found', 'Source: %s, Target: %s' % (sourceURL, targetURL))
            # dump what we do hold, but only once per call
            if not ftsFilesPrinted:
                ftsFilesPrinted = True
                # len() on purpose: truthiness of the job object may be
                # defined independently of how many files it holds
                if len(self) == 0:
                    self._log.warn('Monitored FTS job is empty!')
                else:
                    self._log.warn('All FTS files are:',
                                   '\n' + '\n'.join('Source: %s, Target: %s' % (ftsFile.SourceSURL,
                                                                                ftsFile.TargetSURL)
                                                    for ftsFile in self))
            continue

        candidateFile.Status = fileStatus
        candidateFile.Error = reason
        candidateFile._duration = duration

        # a failure caused by a missing source gets a dedicated error tag
        if candidateFile.Status == "Failed":
            if any(missingSource.match(reason) for missingSource in self.missingSourceErrors):
                candidateFile.Error = "MissingSource"

    # register successful files
    if self.Status in FTSJob.FINALSTATES:
        return self.finalize()

    return S_OK()
import json
import logging
from optparse import OptionParser

import fts3.rest.client.easy as fts3

# Command line example: print the status of one FTS3 job, first through the
# single-job call and then through the bulk call.
#   usage: <script> [-s ENDPOINT] [-l] JOB_ID
opts = OptionParser()
opts.add_option('-s', '--endpoint', dest='endpoint', default='https://fts3-pilot.cern.ch:8446')
opts.add_option('-l', '--list', dest='list_files', default=False, action='store_true')

(options, args) = opts.parse_args()
if len(args) < 1:
    raise Exception('Need a job id')
job_id = args[0]

# show what the REST client does
logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)

job_status = fts3.get_job_status(context, job_id, list_files=options.list_files)
print(json.dumps(job_status, indent=2))

# same job through the bulk call; print its own result, not job_status again
jobs_statuses = fts3.get_jobs_statuses(context, [job_id], list_files=options.list_files)
print(json.dumps(jobs_statuses, indent=2))