def submitFTS3(self, pinTime=False):
    """Submit this FTS job using the FTS3 REST API.

    :param pinTime: forwarded as ``copy_pin_lifetime``; any truthy value also
                    requests ``bring_online=86400``. A falsy value leaves both unset.
    :returns: S_ERROR if a GUID is already set or the submission raises,
              S_OK() once ``self.FTSGUID`` has been filled in.
    """
    if self.FTSGUID:
        return S_ERROR("FTSJob already has been submitted")

    transfers = []
    for ftsFile in self:
        trans = fts3.new_transfer(ftsFile.SourceSURL,
                                  ftsFile.TargetSURL,
                                  checksum=ftsFile.Checksum,
                                  filesize=ftsFile.Size)
        transfers.append(trans)

    # Empty tokens / pin time are passed as None rather than as falsy values.
    source_spacetoken = self.SourceToken if self.SourceToken else None
    dest_spacetoken = self.TargetToken if self.TargetToken else None
    copy_pin_lifetime = pinTime if pinTime else None
    bring_online = 86400 if pinTime else None

    job = fts3.new_job(transfers=transfers,
                       overwrite=True,
                       source_spacetoken=source_spacetoken,
                       spacetoken=dest_spacetoken,
                       bring_online=bring_online,
                       copy_pin_lifetime=copy_pin_lifetime,
                       retry=3)

    try:
        context = fts3.Context(self.FTSServer)
        self.FTSGUID = fts3.submit(context, job)
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        return S_ERROR("Error at submission: %s" % e)

    # Return an explicit success structure so that callers always receive an
    # S_OK/S_ERROR result instead of None on the success path.
    return S_OK()
def generateContext(ftsServer, ucert, lifetime=25200):
    """Build an fts3 context and explicitly delegate the proxy to the server.

    :param ftsServer: address of the fts3 server
    :param ucert: the path to the certificate to be used
    :param lifetime: duration (in sec) of the delegation to the FTS3 server
                     (default is 7h, like FTS3 default)

    :returns: S_OK(fts3 context), or S_ERROR if the fts3 client raised
    """
    try:
        ftsContext = fts3.Context(endpoint=ftsServer,
                                  ucert=ucert,
                                  request_class=ftsSSLRequest,
                                  verify=False)

        # Explicitly delegate to be sure we have the lifetime we want.
        # Notes kept from the original author:
        #  * the delegation re-happens only when the FTS server decides that
        #    there is not enough time left. At the moment this is 1 hour, so
        #    without a submission for more than 1h there is no valid proxy on
        #    the FTS servers anymore. A future FTS3 release is expected to
        #    trigger the delegation when one third of the lifetime is left.
        #  * the proxy given as parameter might have less than "lifetime" left
        #    since it is cached; this does not matter because the FTS3Agent
        #    renews it often enough.
        #  * FTS3 has an issue with handling the lifetime of the proxy because
        #    it does not check the whole chain, see
        #    https://its.cern.ch/jira/browse/FTS-1575
        fts3.delegate(ftsContext,
                      lifetime=datetime.timedelta(seconds=lifetime))
    except FTS3ClientException as e:
        gLogger.exception("Error generating context", repr(e))
        return S_ERROR(repr(e))
    else:
        return S_OK(ftsContext)
def _banStorageElement(self, storageElement):
    """Ban ``storageElement`` on every FTS3 server returned by getFTS3Servers().

    :param storageElement: storage name passed to ``fts3.ban_se``
    :returns: S_OK({endpoint: decoded "ban/se" listing of that endpoint}), or the
              failing result / S_ERROR as soon as the server list or the proxy
              cannot be obtained.
    """
    endpoints = getFTS3Servers()
    # Propagate the lookup failure instead of raising KeyError on 'Value'.
    if not endpoints['OK']:
        return endpoints

    blacklist = {}
    for endpoint in endpoints['Value']:
        # endpoint = 'https://fts3-pilot.cern.ch:8446'

        # TODO: maybe proxyPath is not needed since it is picked from the
        # environment by the REST API
        proxyInfo = getProxyInfo()
        if not proxyInfo['OK']:
            return proxyInfo

        try:
            proxyPath = proxyInfo['Value']['path']
        except Exception as e:
            return S_ERROR(repr(e).replace(',)', ')'))

        context = fts3.Context(endpoint, proxyPath)
        timeout = 3600  # or...?
        status = 'wait'  # or...?
        allow_submit = False  # or...?

        # TODO: ban_se returns the list of jobIDs interrupted by the banning
        pausedJobIDs = fts3.ban_se(context, storageElement, status, timeout,
                                   allow_submit)
        self.log.info("fts3.ban_se: %s" % pausedJobIDs)

        blacklist[endpoint] = json.loads(context.get("ban/se"))

    return S_OK(blacklist)
def recoverFINISHEDDIRTY(s, ftsFileName):
    """Sort the non-FINISHED files of a job into "retry" and "missing" lists.

    The job status is fetched with per-file details. Every file whose state is
    not FINISHED is classified from its failure reason:

    * "500 No such file or directory" -> written under "DONE/Bad/" (tag "M")
    * anything else                    -> cleaned up and written under "TODO/" (tag "D")

    :param s: passed through to getNewStatus
    :param ftsFileName: name of the transfer list file being recovered
    """
    (_status, ftsJID, _fStat, fServer) = getNewStatus(s, ftsFileName)

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)

    failedFiles = []
    missFiles = []
    for fileInfo in jobStat['files']:
        if fileInfo["file_state"] == "FINISHED":
            continue

        # The reason can be empty/None for some states; treat it as an empty
        # string so the substring tests below cannot raise TypeError.
        reason = fileInfo["reason"] or ""
        pair = (fileInfo["source_surl"], fileInfo["dest_surl"])

        if ("Probably stalled" in reason
                or "globus_ftp_control_local_pasv failed" in reason):
            # Known transient failures: retry silently.
            failedFiles.append(pair)
        elif "500 No such file or directory" in reason:
            print("%s %s %s" % (fServer, fileInfo["source_surl"], reason))
            # Monitoring URL: same host, last character of the endpoint replaced by "9".
            print(fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID)
            missFiles.append(pair)
        else:
            print("%s %s %s" % (ftsFileName, fileInfo["source_surl"], reason[:50]))
            failedFiles.append(pair)

    cleanUpTransfer(failedFiles, ftsFileName)
    writeTransfer(failedFiles, "TODO/", "D", ftsFileName)
    writeTransfer(missFiles, "DONE/Bad/", "M", ftsFileName)
def _banStorageElement(self, storageElement):
    """Ban ``storageElement`` on every FTS3 server returned by getFTS3Servers().

    :param storageElement: storage name passed to ``fts3.ban_se``
    :returns: S_OK({endpoint: decoded "ban/se" listing of that endpoint}), or an
              error result if the server list or the proxy cannot be obtained.
    """
    endpoints = getFTS3Servers()
    # Propagate the lookup failure instead of raising KeyError on 'Value'.
    if not endpoints['OK']:
        return endpoints

    blacklist = {}
    for endpoint in endpoints['Value']:
        #endpoint = 'https://fts3-pilot.cern.ch:8446'

        #TODO: maybe proxyPath is not needed since it is picked from the environment by the REST API
        proxyInfo = getProxyInfo()
        if not proxyInfo.get('OK'):
            return S_ERROR("Proxy not found!")

        try:
            proxyPath = proxyInfo.get('Value').get('path')
        except Exception as e:
            # str(e) instead of e.message: the attribute is gone in Python 3.
            return S_ERROR(str(e))

        context = fts3.Context(endpoint, proxyPath)
        timeout = 3600  #or...?
        status = 'wait'  #or...?
        allow_submit = False  #or...?

        #TODO: ban_se returns the list of jobIDs interrupted by the banning
        fts3.ban_se(context, storageElement, status, timeout, allow_submit)

        blacklist[endpoint] = json.loads(context.get("ban/se"))

    return S_OK(blacklist)

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
def _unbanStorageElement(self, storageElement):
    """Lift the ban of ``storageElement`` on every configured FTS3 server.

    :param storageElement: storage name passed to ``fts3.unban_se``
    :returns: S_OK({endpoint: decoded "ban/se" listing of that endpoint}), or an
              error result if the server list or the proxy cannot be obtained.
    """
    endpoints = getFTS3Servers()
    # Propagate the lookup failure instead of raising KeyError on 'Value'.
    if not endpoints['OK']:
        return endpoints

    blacklist = {}
    for endpoint in endpoints['Value']:
        #endpoint = 'https://fts3-pilot.cern.ch:8446'

        #TODO: maybe proxyPath is not needed since it is picked from the environment by the REST API
        proxyInfo = getProxyInfo()
        if not proxyInfo.get('OK'):
            return S_ERROR("Proxy not found!")

        try:
            proxyPath = proxyInfo.get('Value').get('path')
        except Exception as e:
            # str(e) instead of e.message: the attribute is gone in Python 3.
            return S_ERROR(str(e))

        context = fts3.Context(endpoint, proxyPath)
        fts3.unban_se(context, storageElement)

        # Record what is still banned on this endpoint after the unban.
        blacklist[endpoint] = json.loads(context.get("ban/se"))

    return S_OK(blacklist)
def submit(proxy, toTrans, source, destination):
    """Submit the LFNs in ``toTrans`` to FTS3 in jobs of at most 200 files.

    For every chunk the source/destination PFNs are built with
    apply_tfc_to_lfn, the proxy is delegated (48h, not forced) and one job is
    submitted. The keys of the returned job status are printed.

    :param proxy: path used both as user certificate and key for the context
    :param toTrans: LFNs to transfer
    :param source: source site, passed to apply_tfc_to_lfn
    :param destination: destination site, passed to apply_tfc_to_lfn
    """
    for files in chunks(toTrans, 200):
        # Start a fresh list for each job: a single list shared by all chunks
        # would resubmit the files of every previous chunk.
        transfers = []

        c = pycurl.Curl()
        try:
            # create destination and source pfns for job
            for lfn in files:
                print(lfn)
                transfers.append(
                    fts3.new_transfer(apply_tfc_to_lfn(source, lfn, c),
                                      apply_tfc_to_lfn(destination, lfn, c)))
        finally:
            # Release the curl handle even if a PFN lookup fails.
            c.close()

        # Submit fts job
        context = fts3.Context('https://fts3.cern.ch:8446', proxy, proxy,
                               verify=True)
        print(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

        job = fts3.new_job(transfers)
        jobid = fts3.submit(context, job)

        # Only the keys are printed, so iterate the mapping directly
        # (works on Python 2 and 3, unlike iteritems()).
        for key in fts3.get_job_status(context, jobid, list_files=True):
            print(key)
def submitTheFTSJob(ftsFile):
    """Submit the source/target pairs listed in DOING/<ftsFile> as one FTS job.

    The FTS server is picked at random among ftsServ1..3 (earlier variants
    used two servers, either uniformly or with a 0.7 weight on ftsServ2).

    :param ftsFile: file name below ``ceBase + "DOING/"``; one
                    "<sourceSURL> <targetSURL>" pair per line
    :returns: (ftsJobID, ftsServ) on submission, ("-1", "-1") when no file
              of the list is usable
    """
    fList = [ftsServ1, ftsServ2, ftsServ3]
    ftsServ = random.choice(fList)
    context = fts3.Context(ftsServ)

    # Read the whole list and close the file right away.
    with open(ceBase + "DOING/" + ftsFile) as pairFile:
        listOfPairs[:] = pairFile.read().split("\n")

    # checkStatusOnCastor() checks the files in parallel; once it is done the
    # module-level list "okayFiles" should be filled.
    if checkStatus:
        checkStatusOnCastor()
    else:
        okayFiles[:] = []
        for onePair in listOfPairs:
            # Skip blank / truncated lines.
            if len(onePair) < 10:
                continue
            (sourceSURL, targetSURL) = onePair.split(" ")
            okayFiles.append((sourceSURL, targetSURL))

    if len(okayFiles) > 0:
        transfers = [fts3.new_transfer(oneSet[0], oneSet[1])
                     for oneSet in okayFiles]
        # reuse=False requested by Andrea Manzi
        job = fts3.new_job(transfers=transfers, overwrite=True,
                           verify_checksum=True, reuse=False, retry=5)
        ftsJobID = fts3.submit(context, job)
        return ftsJobID, ftsServ

    # None of the files in this lot were good!
    return "-1", "-1"
def _unbanStorageElement(self, storageElement):
    """Lift the ban of ``storageElement`` on every configured FTS3 server.

    :param storageElement: storage name passed to ``fts3.unban_se``
    :returns: S_OK({endpoint: decoded "ban/se" listing}), or the first failing
              result from getFTS3Servers / getProxyInfo, or S_ERROR if the
              proxy path cannot be extracted.
    """
    res = getFTS3Servers()
    if not res['OK']:
        return res

    remainingBans = {}
    for ftsEndpoint in res['Value']:
        # e.g. ftsEndpoint = 'https://fts3-pilot.cern.ch:8446'

        # TODO: maybe the proxy path is not needed since it is picked from the
        # environment by the REST API
        proxyInfo = getProxyInfo()
        if not proxyInfo['OK']:
            return proxyInfo

        try:
            proxyLocation = proxyInfo['Value']['path']
        except Exception as e:
            return S_ERROR(repr(e).replace(',)', ')'))

        ftsContext = fts3.Context(ftsEndpoint, proxyLocation)
        fts3.unban_se(ftsContext, storageElement)

        remainingBans[ftsEndpoint] = json.loads(ftsContext.get("ban/se"))

    return S_OK(remainingBans)
def __init__(self, config, quiet, debug, test=False):
    """Set up configuration, logging, the user queue and the FTS3 context.

    :param config: object exposing the ``Getter`` and ``Monitor`` sections
    :param quiet: if true (and debug is false) log at WARNING level
    :param debug: if true log at DEBUG level (takes precedence over quiet)
    :param test: currently unused, see TODO below
    """
    # TODO: use test in input to set self.TEST
    self.config_getter = config.Getter
    self.config = config.Monitor
    self.TEST = False
    createLogdir('Done')

    def setRootLogger(quiet, debug):
        """Configure the root logger (taken from CRABServer TaskWorker).

        When not testing, the root logger writes to logs/monitor.txt and every
        logging instruction is propagated to it.

        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :return logger: the "master" process logger
        """
        createLogdir('logs')
        root = logging.getLogger()

        if self.TEST:
            # if we are testing, logging to the console is easier
            root.addHandler(logging.StreamHandler())
        else:
            fileHandler = MultiProcessingLog('logs/monitor.txt', when='midnight')
            fileHandler.setFormatter(
                logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s"))
            root.addHandler(fileHandler)

        # debug wins over quiet; INFO is the default
        if debug:
            loglevel = logging.DEBUG
        elif quiet:
            loglevel = logging.WARNING
        else:
            loglevel = logging.INFO
        root.setLevel(loglevel)

        logger = setProcessLogger("master")
        logger.debug("PID %s.", os.getpid())
        logger.debug("Logging level initialized to %s.", loglevel)
        return logger

    self.STOP = False
    self.logger = setRootLogger(quiet, debug)
    self.active_users = list()
    self.q = Queue()
    # The ops proxy is used both as certificate and as key.
    opsProxy = self.config_getter.opsProxy
    self.context = fts3.Context(self.config_getter.serverFTS, opsProxy,
                                opsProxy, verify=True)
def getNewStatus(s, f, fid=""):
    """Ask the FTS server for the current state of the job handling file ``f``.

    :param s: passed through to getStatusForJob
    :param f: transfer list file name
    :param fid: FTS job id; when shorter than 3 characters the id recorded by
                getStatusForJob is used instead
    :returns: (job_state, job id, full status dict, server), or
              ("Unknown-notsubmitted", -1, 0, "-1") when no job id is recorded
    """
    # Always look the job up: the server is needed even when the caller gives
    # the job id, otherwise fServer would be unbound below.
    (knownFid, _fstat, _fIter, fServer) = getStatusForJob(s, f)
    if len(fid) < 3:
        fid = knownFid

    if fid == "-1":
        print("File  %s not submitted to FTS?" % (f,))
        return "Unknown-notsubmitted", -1, 0, "-1"

    context = fts3.Context(fServer)
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
def monitorFTS3(self, full=False):
    """Fetch the status of this job, including its files, from the FTS3 server.

    :param full: not used by the code visible here
    :returns: S_ERROR if no GUID is set or the query raises,
              S_OK(job status dict) otherwise
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    try:
        context = fts3.Context(endpoint=self.FTSServer)
        jobStatusDict = fts3.get_job_status(context, self.FTSGUID,
                                            list_files=True)
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        return S_ERROR("Error getting the job status %s" % e)

    # Hand the status back rather than discarding it, so that callers get an
    # S_OK/S_ERROR structure on both paths.
    return S_OK(jobStatusDict)
def getNewStatus(s, f, fid="", context=0):
    """Ask the FTS server for the current state of the job handling file ``f``.

    :param s: passed through to getStatusForJob
    :param f: transfer list file name
    :param fid: FTS job id; when shorter than 3 characters the id recorded by
                getStatusForJob is used instead
    :param context: fts3 context to reuse; 0 means "create one for the server"
    :returns: (job_state, job id, full status dict, server) on success,
              ("Unknown-notsubmitted", -1, 0, "-1") when no job id is recorded,
              ("Unknow", -1, 0, "-1") when the status query fails
    """
    # Always look the job up: the server is part of the return value even when
    # the caller provides fid/context. Previously fServer could be unbound and
    # the resulting NameError was silently swallowed by a bare except.
    (knownFid, _fstat, _fIter, fServer) = getStatusForJob(s, f)
    if len(fid) < 3:
        fid = knownFid

    if fid == "-1":
        return "Unknown-notsubmitted", -1, 0, "-1"

    if context == 0:
        context = fts3.Context(fServer)

    try:
        ftsStat = fts3.get_job_status(context, fid)
        return ftsStat["job_state"], fid, ftsStat, fServer
    except Exception:
        # Best effort: any query failure is reported as an unknown job, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("File  %s unknown to FTS?" % (f,))
        # NOTE(review): "Unknow" (sic) is kept byte-for-byte; confirm whether
        # callers expect "Unknown" before changing it.
        return "Unknow", -1, 0, "-1"
def fts3_delegate(fts3_endpoint='https://fts3-pilot.cern.ch:8446'):
    """Make sure a valid proxy delegation exists on the FTS3 endpoint.

    A new VOMS proxy is created when the current one has expired. The
    delegation is renewed (12h, forced) when none can be confirmed or when the
    existing one terminates in less than one hour.

    :param fts3_endpoint: URL of the FTS3 REST endpoint
    :returns: None (also returned early when the proxy creation fails)
    """
    if voms_proxy_expired():
        print("INFO: creating new proxy.")
        proxy = voms_proxy_init()
        if proxy:
            print("Proxy info:")
            print("path: {}".format(proxy['path']))
            print("expiration: {}".format(proxy['expiration']))
            print("timestamp: {}".format(proxy['TS']))
        else:
            print("FATAL: proxy creation failed.")
            return
    else:
        proxy = voms_proxy_info()
        print("INFO: proxy valid, avoiding recreation.")

    fts3_context = fts3.Context(fts3_endpoint, verify=True)
    whoami = fts3.whoami(fts3_context)

    no_valid_delegation = False
    # Defaults to "now", so a failed lookup always falls below the threshold.
    termination_time = datetime.utcnow()
    elapsed_threshold = timedelta(hours=1)

    try:
        delegation_ID = whoami['delegation_id']
        check_delegation_json = fts3_check_delegation(delegation_ID, proxy,
                                                      fts3_endpoint)
        if check_delegation_json:
            termination_time = datetime.strptime(
                check_delegation_json['termination_time'].replace('T', ' '),
                '%Y-%m-%d %H:%M:%S')
            print('INFO: Delegation valid until {} UTC'.format(
                termination_time.strftime('%H:%M:%S %Y-%m-%d')))
        else:
            no_valid_delegation = True
    except Exception:
        # The delegation could not be verified, so treat it as missing
        # (the flag used to be reset to False here).
        no_valid_delegation = True

    if no_valid_delegation:
        print("INFO: no valid delegation found")

    if (termination_time - elapsed_threshold) < datetime.utcnow() or no_valid_delegation:
        print('INFO: Renewing delegation!')
        delegation_ID_2 = fts3.delegate(fts3_context,
                                        lifetime=timedelta(hours=12),
                                        force=True)
        print('INFO: New delegation ID = {}'.format(delegation_ID_2))
    else:
        print('INFO: Nothing to do...')
def submitTheFTSJob(ftsFile):
    """Submit the pairs listed in DOING/<ftsFile>, skipping missing sources.

    Each source is probed with ``gfal-stat``. Pairs whose source does not exist
    are appended to DONE/badFileList.txt, the others go into one FTS job.
    The FTS server is picked at random among ftsServ1..3 (earlier variants
    used two servers, either uniformly or with a 0.7 weight on ftsServ2).

    :param ftsFile: file name below ``ceBase + "DOING/"``; one
                    "<sourceSURL> <targetSURL>" pair per line
    :returns: (ftsJobID, ftsServ) on submission, ("-1", "-1") when there is
              nothing to submit
    """
    fList = [ftsServ1, ftsServ2, ftsServ3]
    ftsServ = random.choice(fList)
    context = fts3.Context(ftsServ)

    with open(ceBase + "DOING/" + ftsFile) as pairFile:
        filecontent = pairFile.read().split("\n")

    transfers = []
    for ftra in filecontent:
        # Skip blank / truncated lines.
        if len(ftra) < 10:
            continue
        (sourceSURL, targetSURL) = ftra.split(" ")

        # Argument list without a shell: the SURL comes from a file and may
        # contain characters ("?", "&", ";") a shell would interpret.
        try:
            runComm = subprocess.Popen(["gfal-stat", sourceSURL],
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       close_fds=True,
                                       universal_newlines=True)
            theInfo = runComm.communicate()[1].strip()
        except OSError:
            # gfal-stat could not be started: the file cannot be proven
            # missing, so it is submitted (as happened with the shell form).
            theInfo = ""

        if theInfo.startswith("gfal-stat error: 2 (No such file or directory)"):
            with open(ceBase + "DONE/badFileList.txt", "a") as bFTS:
                bFTS.write(ftra + "\n")
        else:
            transfers.append(fts3.new_transfer(sourceSURL, targetSURL))

    if len(transfers) > 0:
        # reuse=False was requested by Andrea Manzi; retry=0 avoids deleted
        # files snarling up the system for hours.
        job = fts3.new_job(transfers=transfers, overwrite=True,
                           verify_checksum=True, reuse=False, retry=0)
        ftsJobID = fts3.submit(context, job,
                               delegation_lifetime=fts3.timedelta(hours=72))
        return ftsJobID, ftsServ

    return "-1", "-1"
def algorithm(self):
    """Main monitoring loop.

    - start ``max_threads_num`` daemon worker threads reading from ``self.q``
    - (re)create the FTS context and delegate the ops proxy when the tick
      counter is 0, i.e. at start and then once every 12h
    - scan the ``Monitor`` user folders every 10 seconds: users with jobs that
      are not yet active are queued, active users without jobs are dropped

    :return:
    """
    # TODO: monitor is probably better with multiproc
    renewal_ticks = 6 * 60 * 12  # 10s ticks -> delegate every 12h

    workers = list()
    for idx in range(self.config.max_threads_num):
        thread = Thread(target=self.worker, args=(idx, self.q))
        thread.daemon = True
        thread.start()
        workers.append(thread)

    ticks = 0
    while not self.STOP:
        if ticks == 0 and not self.config.TEST:
            ops_proxy = self.config_getter.opsProxy
            self.context = fts3.Context(self.config_getter.serverFTS,
                                        ops_proxy, ops_proxy, verify=True)
            self.logger.debug(
                fts3.delegate(self.context, lifetime=timedelta(hours=48),
                              force=False))

        for user in os.listdir('Monitor'):
            has_jobs = len(os.listdir('Monitor/' + user)) != 0
            is_active = user in self.active_users
            if has_jobs and not is_active:
                self.active_users.append(user)
                self.q.put(user)
            elif not has_jobs and is_active:
                self.active_users.remove(user)

        ticks = ticks + 1 if ticks < renewal_ticks else 0

        self.logger.info('%s active users' % len(self.active_users))
        self.logger.debug('Active users are: %s' % self.active_users)
        self.logger.debug('Queue lenght: %s' % self.q.qsize())
        time.sleep(10)

    for thread in workers:
        thread.join()

    self.logger.info('Monitor stopped.')
def submitFTS3(self, pinTime=False):
    """Submit this FTS job using the FTS3 REST API.

    The fts3 context is created on first use and cached in
    ``self._fts3context``. On success the job and all its files are flagged
    "Submitted" and the files receive the job GUID.

    :param pinTime: forwarded as ``copy_pin_lifetime``; any truthy value also
                    requests ``bring_online=86400``
    :returns: S_OK() on success, S_ERROR if already submitted or if the
              submission raises
    """
    if self.FTSGUID:
        return S_ERROR("FTSJob already has been submitted")

    transfers = [
        fts3.new_transfer(ftsFile.SourceSURL,
                          ftsFile.TargetSURL,
                          checksum='ADLER32:%s' % ftsFile.Checksum,
                          filesize=ftsFile.Size)
        for ftsFile in self
    ]

    # Falsy tokens / pin time are handed over as None.
    job = fts3.new_job(transfers=transfers,
                       overwrite=True,
                       source_spacetoken=self.SourceToken or None,
                       spacetoken=self.TargetToken or None,
                       bring_online=86400 if pinTime else None,
                       copy_pin_lifetime=pinTime or None,
                       retry=3)

    try:
        if not self._fts3context:
            self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                             request_class=ftsSSLRequest,
                                             verify=False)
        self.FTSGUID = fts3.submit(self._fts3context, job)
    except Exception as e:
        return S_ERROR("Error at submission: %s" % e)

    self.Status = "Submitted"
    loggerName = "req_%s/FTSJob-%s" % (self.RequestID, self.FTSGUID)
    self._log = gLogger.getSubLogger(loggerName, True)

    for submittedFile in self:
        submittedFile.FTSGUID = self.FTSGUID
        submittedFile.Status = "Submitted"

    return S_OK()
def recoverFINISHEDDIRTY(s, ftsFileName):
    """Retry every file of the job that did not reach the FINISHED state.

    :param s: passed through to getNewStatus
    :param ftsFileName: name of the transfer list file being recovered
    """
    (status, ftsJID, _fStat, fServer) = getNewStatus(s, ftsFileName)

    # Single pre-formatted string: prints identically on Python 2 and 3.
    print("Finished dirty for  %s  with ftsID %s  status %s ." %
          (ftsFileName, ftsJID, status))

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)

    # Collect the (source, destination) pairs of all unfinished files.
    failedFiles = [(fileInfo["source_surl"], fileInfo["dest_surl"])
                   for fileInfo in jobStat['files']
                   if fileInfo["file_state"] != "FINISHED"]

    for fF in failedFiles:
        print(fF)

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
def generateContext(ftsServer, ucert):
    """Create an fts3 context for the given server and certificate.

    :param ftsServer: address of the fts3 server
    :param ucert: the path to the certificate to be used

    :returns: S_OK(fts3 context), or S_ERROR if the fts3 client raised
    """
    try:
        return S_OK(fts3.Context(endpoint=ftsServer,
                                 ucert=ucert,
                                 request_class=ftsSSLRequest,
                                 verify=False))
    except FTS3ClientException as e:
        errRepr = repr(e)
        gLogger.exception("Error generating context", errRepr)
        return S_ERROR(errRepr)
def _do_ftscall(self, binding=None, url=None):
    """Call the FTS server, retrying with an increasing delay on server errors.

    Exactly one of ``binding`` / ``url`` is expected:

    :param binding: (method name, args, kwargs) of an ``fts3`` easy-binding
                    function; it is called as ``method(context, *args, **kwargs)``
    :param url: resource fetched with ``context.get`` and JSON-decoded
    :returns: the binding's return value or the decoded JSON document
    :raises RuntimeError: when all 10 attempts failed
    """
    if self._context is None:
        # request_class = Request -> use "requests"-based https call (instead of default PyCURL,
        # which may not be able to handle proxy certificates depending on the cURL installation)
        # verify = False -> do not verify the server certificate
        context = fts3.Context(self.server_url, ucert=self.x509proxy,
                               ukey=self.x509proxy, request_class=Request,
                               verify=False)
        if self.keep_context:
            self._context = context
    else:
        context = self._context

    reqstring = binding[0] if binding is not None else url
    LOG.debug('FTS: %s', reqstring)

    max_attempts = 10
    wait_time = 1.
    # range() instead of xrange(): valid on Python 2 and 3.
    for attempt in range(max_attempts):
        try:
            if binding is not None:
                method, args, kwd = binding
                return getattr(fts3, method)(context, *args, **kwd)
            return json.loads(context.get(url))
        except fts_exceptions.ServerError as exc:
            if str(exc.reason) == '500':
                # Internal server error - let's try again
                pass
            # NOTE(review): other ServerError reasons are retried as well,
            # exactly as before - confirm this is intended.
        except fts_exceptions.TryAgain:
            pass

        # Back off before the next attempt; there is nothing to wait for
        # after the last one.
        if attempt < max_attempts - 1:
            time.sleep(wait_time)
            wait_time *= 1.5

    LOG.error('Failed to communicate with FTS server: %s', reqstring)
    raise RuntimeError('Failed to communicate with FTS server: %s' % reqstring)
def recoverFINISHEDDIRTY(s, ftsFileName):
    """Retry every file of the job that did not reach the FINISHED state.

    When the job status cannot be determined, the list file is moved from
    DONE/Dirty back to TODO so that it is processed again.

    :param s: passed through to getNewStatus
    :param ftsFileName: name of the transfer list file being recovered
    """

    def _requeue():
        # Put the list file back into TODO so it gets another attempt.
        shutil.move(ceBase + "DONE/Dirty/" + ftsFileName,
                    ceBase + "TODO/" + ftsFileName)

    (status, ftsJID, _fStat, fServer) = getNewStatus(s, ftsFileName)
    if status == "Unknown":
        # Try again ...
        _requeue()
        return

    # Single pre-formatted string: prints identically on Python 2 and 3.
    print("Finished dirty for  %s  with ftsID %s  status %s ." %
          (ftsFileName, ftsJID, status))

    if ftsJID == -1:
        print("Probably in old sqlite dB. Could not check - retry")
        _requeue()
        return

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)

    # Collect the (source, destination) pairs of all unfinished files.
    failedFiles = [(fileInfo["source_surl"], fileInfo["dest_surl"])
                   for fileInfo in jobStat['files']
                   if fileInfo["file_state"] != "FINISHED"]
    if not failedFiles:
        return

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
def _banStorageElement(self, storageElement):
    """Ban ``storageElement`` on every configured FTS3 server.

    Submission to the storage is disallowed for 3600 seconds and the ids of
    the jobs returned by ``fts3.ban_se`` are logged.

    :param storageElement: storage name passed to ``fts3.ban_se``
    :returns: S_OK({endpoint: decoded "ban/se" listing}), or the first failing
              result from getFTS3Servers / getProxyInfo, or S_ERROR if the
              proxy path cannot be extracted.
    """
    res = getFTS3Servers()
    if not res['OK']:
        return res

    bannedPerEndpoint = {}
    for ftsEndpoint in res['Value']:
        # e.g. ftsEndpoint = 'https://fts3-pilot.cern.ch:8446'

        # TODO: maybe the proxy path is not needed since it is picked from the
        # environment by the REST API
        proxyInfo = getProxyInfo()
        if not proxyInfo['OK']:
            return proxyInfo

        try:
            proxyLocation = proxyInfo['Value']['path']
        except Exception as e:
            return S_ERROR(repr(e).replace(',)', ')'))

        ftsContext = fts3.Context(ftsEndpoint, proxyLocation)

        # 'wait' leaves the jobs queued. The only alternative is "cancel".
        pausedJobIDs = fts3.ban_se(ftsContext, storageElement, 'wait',
                                   timeout=3600, allow_submit=False)
        self.log.info("fts3.ban_se: paused jobs: %s" % ','.join(pausedJobIDs))

        bannedPerEndpoint[ftsEndpoint] = json.loads(ftsContext.get("ban/se"))

    return S_OK(bannedPerEndpoint)
def monitor(self, context=None, ftsServer=None, ucert=None):
    """Query the fts server to monitor the job and update this object.

    This method assumes that the attribute self.ftsGUID is set.

    :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
    :param ftsServer: the address of the fts server to submit to. Used only if context is
                      not given. If not given either, use the ftsServer object attribute
    :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

    :returns: S_OK({FileID: {status, error}}), or S_ERROR if the GUID is not
              set or the status query fails
    """
    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        if not ftsServer:
            ftsServer = self.ftsServer
        context = fts3.Context(endpoint=ftsServer, ucert=ucert,
                               request_class=ftsSSLRequest, verify=False)

    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID,
                                            list_files=True)
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    now = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = now

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
        self.status = newStatus
        self.lastUpdate = now
        self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    filesInfoList = jobStatusDict['files']
    filesStatus = {}
    statusSummary = {}

    for fileDict in filesInfoList:
        file_state = fileDict['file_state'].capitalize()
        # The file id is stored as the transfer metadata.
        file_id = fileDict['file_metadata']
        file_error = fileDict['reason']
        filesStatus[file_id] = {'status': file_state, 'error': file_error}

        statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    total = len(filesInfoList)
    completed = sum(statusSummary.get(state, 0)
                    for state in FTS3File.FTS_FINAL_STATES)

    # A job without files would divide by zero; leave completeness untouched
    # then. Floor division keeps the value an integer on Python 3 as well.
    if total:
        self.completeness = 100 * completed // total

    return S_OK(filesStatus)
def main():
    """Run the FTS datalake tests described by a JSON configuration file.

    Steps, in order:

    1. parse the command line (``-i`` configuration file, ``--cleanup``, ``--exit``)
    2. read the test parameters from the JSON configuration
    3. derive the list of unique endpoints and set up their test folders
    4. for every job round, protocol, ordered endpoint pair, file size and
       file count: make sure the source holds the files (uploading freshly
       generated random files if needed) and submit an FTS job
    5. wait for all submitted jobs

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    text; confirm the block boundaries against the original file.
    """
    parser = argparse.ArgumentParser(description="Run FTS Datalake Tests")
    parser.add_argument("-i",
                        required=True,
                        dest="conf_file",
                        help="Configuration file")
    parser.add_argument("--cleanup",
                        required=False,
                        action='store_true',
                        default=False,
                        help="Clean up src/dst directories")
    parser.add_argument("--exit",
                        required=False,
                        action='store_true',
                        default=False,
                        help="Exit after cleanup")
    arg = parser.parse_args()
    conf_file = str(arg.conf_file)
    cleanup = arg.cleanup
    # NOTE(review): this local shadows the builtin exit() inside main().
    exit = arg.exit

    # open configuration file to get test details
    with open(conf_file) as json_file:
        data = json.load(json_file)

    # assign json variables
    protocol_map = data['protocols']
    num_of_files_list = data['num_of_files']
    filesize_list = data['filesizes']
    num_of_jobs = data['num_of_jobs']
    testing_folder = data['testing_folder']
    checksum = data["checksum"]
    overwrite = data["overwrite"]
    metadata = data['metadata']

    # figure out the unique endpoints from the configuration
    # (an endpoint reachable through several protocols/ports is kept once)
    endpoints = []
    endpoint_tlist = []
    for protocol in protocol_map:
        protocol_endpoints = protocol_map[protocol]
        for endpoint in protocol_endpoints:
            # example: endpoint = door05.pic.es:8452//rucio/pic_dcache
            endpoint_t = endpoint.split(":", 1)[0]
            # example: endpoint_t = door05.pic.es
            endpoint_e = re.split('[0-9]*', endpoint.split(":", 1)[1], 1)[1]
            # example: endpoint_e = //rucio/pic_dcache
            endpoint_ft = endpoint_t + endpoint_e
            if endpoint_ft not in endpoint_tlist:
                endpoint_tlist.append(endpoint_ft)
                endpoints.append("{}://{}".format(protocol, endpoint))
    del endpoint_tlist

    # setup folders at the testing endpoints if needed
    _flush_logging_msg("Setting up folders at endpoints")
    prob_endpoints = _gfal_setup_folders(endpoints, testing_folder, cleanup)
    # we have some problematic endpoints
    if prob_endpoints:
        _flush_logging_msg(
            "Problematic endpoints (will not be tested): {})".format(
                prob_endpoints))

    # the script is used as a setup script so do not perform testing
    if exit:
        sys.exit(1)

    # ----------------------------------------------------------------------
    # authenticate @ FTS endpoint
    # https://gitlab.cern.ch/fts/fts-rest/-/blob/develop/src/fts3/rest/client/context.py#L148
    _flush_logging_msg('Authenticating at {}'.format(FTS_ENDPOINT))
    context = fts3.Context(FTS_ENDPOINT, verify=True)

    # list that holds a dictionary per each job
    # this is later used to poll for the jobs until they finish
    job_map_list = []

    # for every job
    for _ in xrange(num_of_jobs):
        # for every protocol to be checked
        for protocol in protocol_map:
            # get endpoints
            protocol_endpoints = protocol_map[protocol]
            # create unique ordered pairs (source, destination)
            endpnt_pairs = itertools.permutations(protocol_endpoints, 2)
            # for every pair
            for endpnt_pair in endpnt_pairs:
                # ad-hoc temp solution for lapp-webdav - remove checksum
                # NOTE(review): checksum is never reset afterwards, so "none"
                # also applies to all pairs processed later - confirm intent.
                if endpnt_pair[0] == "lapp-esc02.in2p3.fr:8001/webdav":
                    checksum = "none"
                if endpnt_pair[1] == "lapp-esc02.in2p3.fr:8001/webdav":
                    checksum = "none"
                # --
                abort_source = False
                source_url = "{}://{}".format(protocol, endpnt_pair[0])
                dest_url = "{}://{}".format(protocol, endpnt_pair[1])

                # if the source endpoint is faulty, abort this run
                if endpnt_pair[0] in prob_endpoints:
                    _flush_logging_msg("Aborting run for source: {}".format(
                        endpnt_pair[0]))
                    continue

                _flush_logging_msg(
                    ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                _flush_logging_msg("Source: {}".format(source_url))
                _flush_logging_msg("Destination: {}".format(dest_url))

                # for every filesize combination
                for filesize in filesize_list:
                    # set by the inner loop when the source turned out unusable
                    if abort_source:
                        _flush_logging_msg(
                            "Aborting run for source: {}".format(
                                source_url))
                        break
                    # for every files per job combination
                    for numfile in num_of_files_list:
                        # configure destination filenames
                        local_file_paths = []
                        dest_filenames = []
                        for nfile in xrange(numfile):
                            random_suffix = str(uuid.uuid1())
                            random_filename = "{}.{}".format(
                                FILE_PREFIX, random_suffix)
                            dest_filenames.append(random_filename)
                            file_path = os.path.join(
                                LOCALPATH_TEMP_DIR, random_filename)
                            local_file_paths.append(str(file_path))

                        source_dir = os.path.join(source_url, testing_folder,
                                                  "src")
                        # check if source has adequate number of files of
                        # the desired filesize
                        _flush_logging_msg(
                            "Checking source for {} existing {}MB files".
                            format(numfile, filesize))
                        src_filenames = _gfal_check_files(
                            source_dir, filesize, numfile)
                        # -1 signals that the source could not be checked
                        if src_filenames == -1:
                            abort_source = True
                            _flush_logging_msg(
                                "Aborting run for source: {}".format(
                                    source_url))
                            break

                        remove_local_files = False
                        # nothing suitable at the source: create and upload files
                        if not src_filenames:
                            remove_local_files = True
                            for filename in dest_filenames:
                                src_filename = "{}_{}mb".format(
                                    filename, filesize)
                                src_filenames.append(src_filename)

                            # generate random files locally
                            _flush_logging_msg(
                                "Locally generating {} random files of size:{}MB"
                                .format(numfile, filesize))
                            for file_path in local_file_paths:
                                with open(file_path, 'wb') as fout:
                                    fout.write(os.urandom(filesize * MB))

                            # upload files to the source for this job
                            _flush_logging_msg("Uploading files to source")
                            rcode = _gfal_upload_files(
                                local_file_paths, source_dir, src_filenames)
                            if rcode == -1:
                                abort_source = True
                                _flush_logging_msg(
                                    "Aborting run for source: {}".format(
                                        source_url))
                                break

                        # submit fts transfer
                        _flush_logging_msg('Submitting FTS job')
                        job_id = _fts_submit_job(source_url, dest_url,
                                                 src_filenames, dest_filenames,
                                                 checksum, overwrite,
                                                 testing_folder, context,
                                                 metadata)
                        # NOTE(review): this continue skips the local file
                        # removal below - confirm that is acceptable.
                        if job_id == -1:
                            _flush_logging_msg('Job aborted')
                            continue
                        _flush_logging_msg('FTS job id:{}'.format(job_id))

                        # remember what to poll for and what to purge later
                        job_map = {}
                        job_map['job_id'] = job_id
                        job_map['directory'] = os.path.join(
                            dest_url, testing_folder, "dest")
                        job_map['files_to_purge'] = dest_filenames
                        job_map_list.append(job_map)

                        if remove_local_files:
                            # remove files locally
                            _flush_logging_msg(
                                "Removing files from LOCALPATH: {}".format(
                                    LOCALPATH_TEMP_DIR))
                            for file in local_file_paths:
                                os.remove(file)

                _flush_logging_msg(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

    _fts_wait_jobs(context, job_map_list)
    _flush_logging_msg("Testing DONE, program is going to exit now!")
def submit(self, context=None, ftsServer=None, ucert=None, pinTime=36000, protocols=None):
    """Submit the job to the FTS server.

    Some attributes are expected to be defined for the submission to work:
      * type (set by FTS3Operation)
      * sourceSE (only for Transfer jobs)
      * targetSE
      * activity (optional)
      * priority (optional)
      * username
      * userGroup
      * filesToSubmit
      * operationID (optional, used as metadata for the job)

    We also expect the FTSFiles to have an ID defined, as it is given as transfer metadata.

    :param pinTime: Time the file should be pinned on disk (used for transfers and staging).
                    Used only if the source SE is a tape storage
    :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
    :param ftsServer: the address of the fts server to submit to. Used only if context is
                      not given. If not given either, use the ftsServer object attribute
    :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)
    :param protocols: list of protocols from which we should choose the protocol to use

    :returns: S_OK([FTSFiles ids of files submitted]), or S_ERROR if the space
              token lookup, the job construction or the submission fails, or
              if the job type is not supported
    """
    log = gLogger.getSubLogger(
        "submit/%s/%s_%s" % (self.operationID, self.sourceSE, self.targetSE),
        True)

    if not context:
        if not ftsServer:
            ftsServer = self.ftsServer
        context = fts3.Context(endpoint=ftsServer, ucert=ucert,
                               request_class=ftsSSLRequest, verify=False)

    # Construct the target SURL
    res = self.__fetchSpaceToken(self.targetSE)
    if not res['OK']:
        return res
    target_spacetoken = res['Value']

    allLFNs = [ftsFile.lfn for ftsFile in self.filesToSubmit]

    if self.type == 'Transfer':
        res = self._constructTransferJob(pinTime, allLFNs, target_spacetoken,
                                         protocols=protocols)
    elif self.type == 'Staging':
        res = self._constructStagingJob(pinTime, allLFNs, target_spacetoken)
    # elif self.type == 'Removal':
    #   res = self._constructRemovalJob(context, allLFNs, failedLFNs, target_spacetoken)
    else:
        # Without this guard the space token result above would be unpacked
        # as if it were a (job, fileIDs) pair.
        return S_ERROR("Unsupported FTS3 job type: %s" % self.type)

    if not res['OK']:
        return res

    job, fileIDsInTheJob = res['Value']
    setFileIdsInTheJob = set(fileIDsInTheJob)

    try:
        self.ftsGUID = fts3.submit(context, job)
        log.info("Got GUID %s" % self.ftsGUID)

        # The attempt counter is increased for every file we tried to submit;
        # only the files that actually made it into the job are flagged.
        for ftsFile in self.filesToSubmit:
            ftsFile.attempt += 1
            if ftsFile.fileID in setFileIdsInTheJob:
                ftsFile.status = 'Submitted'

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.submitTime = now
        self.lastUpdate = now
        self.lastMonitor = now

    except FTS3ClientException as e:
        log.exception("Error at submission", repr(e))
        return S_ERROR("Error at submission: %s" % e)

    return S_OK(fileIDsInTheJob)
def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Query the fts server to monitor the job.
    The internal state of the object is updated depending on the
    monitoring result.

    In case the job is not found on the server, the status is set to 'Failed'.

    Within a job, only the transfers having a `fileID` metadata are considered.
    This is to allow for multihop jobs doing a staging.

    This method assumes that the attribute self.ftsGUID is set.

    :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
    :param ftsServer: the address of the fts server to submit to. Used only if context is
                      not given. If not given either, use the ftsServer object attribute
    :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli
                  (see its doc)

    :returns: S_OK({FileID: {status, error}}). Files in a final FTS state additionally
              carry an ``ftsGUID`` entry set to None.

              Possible error numbers

              * errno.ESRCH: If the job does not exist on the server
              * errno.EDEADLK: In case the job and file status are inconsistent
                (see comments inside the code)
    """

    if not self.ftsGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
        if not ftsServer:
            ftsServer = self.ftsServer
        context = fts3.Context(endpoint=ftsServer, ucert=ucert,
                               request_class=ftsSSLRequest, verify=False)

    try:
        jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except NotFound:
        # The job is not found: set its status to Failed and return
        self.status = 'Failed'
        return S_ERROR(
            errno.ESRCH,
            "FTSGUID %s not found on %s" % (self.ftsGUID, self.ftsServer))
    except FTS3ClientException as e:
        return S_ERROR("Error getting the job status %s" % e)

    now = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = now

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
        self.status = newStatus
        self.lastUpdate = now
        self.error = jobStatusDict['reason']

    filesStatus = {}
    statusSummary = {}
    # Transfers that correspond to a file of ours (i.e. carrying a fileID)
    trackedFiles = []

    for fileDict in jobStatusDict['files']:
        file_state = fileDict['file_state'].capitalize()
        file_metadata = fileDict['file_metadata']

        # previous version of the code did not have dictionary as file_metadata
        if isinstance(file_metadata, dict):
            file_id = file_metadata.get('fileID')
        else:
            file_id = file_metadata

        # The transfer does not have a fileID attached to it
        # so it does not correspond to a file in our DB: skip it
        # (typical of jobs with different staging protocol == CTA).
        # It is also left out of trackedFiles, such that it is
        # not considered for accounting
        if not file_id:
            continue
        trackedFiles.append(fileDict)

        filesStatus[file_id] = {'status': file_state, 'error': fileDict['reason']}

        # If the state of the file is final for FTS, set ftsGUID of the file to None,
        # such that it is "released" from this job and not updated anymore in future
        # monitoring calls
        if file_state in FTS3File.FTS_FINAL_STATES:
            filesStatus[file_id]['ftsGUID'] = None

        # If the file is not in a final state, but the job is, we return an error.
        # FTS can have inconsistencies where the FTS Job is in a final state
        # but not all the files.
        # The inconsistencies are cleaned every hour on the FTS side.
        # https://its.cern.ch/jira/browse/FTS-1482
        elif self.status in self.FINAL_STATES:
            return S_ERROR(
                errno.EDEADLK,
                "Job %s in a final state (%s) while File %s is not (%s)" %
                (self.ftsGUID, self.status, file_id, file_state))

        statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    # We've removed all the intermediate transfers that we are not interested in
    # so we put this back into the monitoring data such that the accounting is done
    # properly. The accounting is filled only once, and only from the filtered list.
    jobStatusDict['files'] = trackedFiles
    if newStatus in self.FINAL_STATES:
        self._fillAccountingDict(jobStatusDict)

    total = len(trackedFiles)
    completed = sum(statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES)
    # With no tracked transfer there is nothing to compute a ratio from
    # (and dividing would raise): leave the completeness untouched
    if total:
        self.completeness = int(100 * completed / total)

    return S_OK(filesStatus)
def monitorFTS3(self, full=False):
    """ Monitor the fts job using the FTS3 REST API.

    The job ``Status`` is always updated from the server answer, and ``Completeness``
    is updated whenever the job contains at least one file. In ``full`` mode,
    the status, error and duration of each FTSFile of the job are updated as well,
    and the job is finalized if it reached a final state.

    :param full: if False, only the job level information is updated and the
                 summary of the file states is returned
    :returns: S_ERROR if the job was not submitted or its status cannot be fetched;
              S_OK({file state: number of files}) if not ``full``;
              otherwise the result of ``self.finalize()`` when the job is in a
              final state, else S_OK()
    """
    if not self.FTSGUID:
        return S_ERROR("FTSGUID not set, FTS job not submitted?")

    try:
        # The context is created once and cached on the job
        if not self._fts3context:
            self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                             request_class=ftsSSLRequest,
                                             verify=False)
        jobStatusDict = fts3.get_job_status(self._fts3context, self.FTSGUID,
                                            list_files=True)
    except Exception as e:
        return S_ERROR("Error getting the job status %s" % e)

    self.Status = jobStatusDict['job_state'].capitalize()

    filesInfoList = jobStatusDict['files']
    statusSummary = {}
    for fileDict in filesInfoList:
        file_state = fileDict['file_state'].capitalize()
        statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    total = len(filesInfoList)
    completed = sum(statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES)
    # A job without any file has no meaningful ratio (and dividing would raise):
    # leave the completeness untouched in that case
    if total:
        self.Completeness = 100 * completed / total

    if not full:
        return S_OK(statusSummary)

    # The full list of FTSFiles is dumped in the log at most once per call
    ftsFilesPrinted = False
    for fileDict in filesInfoList:
        sourceURL = fileDict['source_surl']
        targetURL = fileDict['dest_surl']
        fileStatus = fileDict['file_state'].capitalize()
        reason = fileDict['reason']
        duration = fileDict['tx_duration']

        # Match the transfer reported by the server with one of our FTSFiles
        candidateFile = next(
            (ftsFile for ftsFile in self
             if ftsFile.SourceSURL == sourceURL and ftsFile.TargetSURL == targetURL),
            None)

        if candidateFile is None:
            self._log.warn(
                'FTSFile not found',
                'Source: %s, Target: %s' % (sourceURL, targetURL))
            if not ftsFilesPrinted:
                ftsFilesPrinted = True
                if not len(self):
                    self._log.warn('Monitored FTS job is empty!')
                else:
                    self._log.warn(
                        'All FTS files are:', '\n' + '\n'.join([
                            'Source: %s, Target: %s' %
                            (ftsFile.SourceSURL, ftsFile.TargetSURL)
                            for ftsFile in self
                        ]))
            continue

        candidateFile.Status = fileStatus
        candidateFile.Error = reason
        candidateFile._duration = duration

        # A failed file without any reason cannot be matched against the patterns
        if candidateFile.Status == "Failed" and reason:
            for missingSource in self.missingSourceErrors:
                if missingSource.match(reason):
                    candidateFile.Error = "MissingSource"
                    break

    # # register successful files
    if self.Status in FTSJob.FINALSTATES:
        return self.finalize()

    return S_OK()
# NOTE(review): the statement below is the tail of a function whose header lies
# outside this chunk -- presumably the failure value of the submission helper
# called further down (its result is compared to "-1"); confirm.
    return "-1", "-1"


# Script body: every job list file found in DOING/RALSpecific/ is moved back to
# DOING/, submitted again through submitTheFTSJob(), and the new FTS job ID is
# stored on the matching `ftsjob` row.
sess = doTheSQLiteAndGetItsPointer()
# submittedFiles = glob.glob(ceBase + "DOING/*.txt")
submittedFiles = glob.glob(ceBase + "DOING/RALSpecific/*.txt")
# print getStatusForJob(sess, "R-uFTSList-02062018-230631.txt")
# NOTE(review): kount is not used in the visible part of the script
kount = 0
for ff in submittedFiles:
    # Only the file name (no directory) is used as key
    tFN = ff.split("/")[-1]
    # tFN = "R-uFTSList-02062018-230631.txt"
    (fID, fStat, fIter, fServ) = getStatusForJob(sess, tFN)
    # Any job recorded on another server aborts the whole script
    if fServ != "https://lcgfts3.gridpp.rl.ac.uk:8446":
        # continue
        sys.exit()
    context = fts3.Context(fServ)
    # We have jobs submitted to the RAL FTS server only
    # First cancel the job
    # print "Cancelling job : ", fID, " file :", tFN
    # stat = fts3.cancel(context, fID)
    # The file is moved back to DOING/ before the submission
    shutil.move(ceBase + "DOING/RALSpecific/" + tFN, ceBase + "DOING/" + tFN)
    ftsJobID, ftsServ = submitTheFTSJob(context, tFN)
    # The printed URL is the server address with its last character replaced by "9"
    # (e.g. ...:8446 becomes ...:8449), followed by the ftsmon job path
    print "Submitted file : ", tFN, " with fts ID : ", ftsJobID, " to server ", ftsServ, " URL:", ftsServ[: -1] + "9/fts3/ftsmon/#/job/" + ftsJobID
    # shutil.move(ceBase + "DOING/" + tFN, ceBase + "DOING/RALSpecific/" + tFN)
    # "-1" is treated as a failed submission: nothing to record
    if ftsJobID == "-1":
        continue
    #Now I have a pair - write them to the SQLite DB.
    m = sess.query(ftsjob).filter(ftsjob.ftsFile == tFN).all()
    if m:
        # Only the first matching row is updated
        m = m[0]
        m.ftsID = ftsJobID
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fetch a snapshot from an FTS3 REST endpoint and print it as indented JSON.

import json
import logging
from optparse import OptionParser

import fts3.rest.client.easy as fts3

opts = OptionParser()
opts.add_option('-s', '--endpoint', dest='endpoint',
                default='https://fts3-pilot.cern.ch:8446',
                help='FTS3 REST endpoint to query (default: %default)')
(options, args) = opts.parse_args()

# Show the debug output of the REST client
logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)
snapshot = fts3.get_snapshot(context)

# Parenthesised single-argument form: same output with the Python 2 print
# statement and with the Python 3 print function
print(json.dumps(snapshot, indent=2))
# For every job list file found in DOING/, look up the matching `ftsjob` row and
# ask the FTS server recorded on that row to cancel the job.
# NOTE(review): `sess`, `ceBase` and `ftsjob` come from outside this chunk.
jobFiles = glob.glob(ceBase + "DOING/*.txt")
for jFile in jobFiles:
    # print jFile
    # Rows are keyed on the file name only (no directory)
    jobID = sess.query(ftsjob).filter(
        ftsjob.ftsFile == jFile.split("/")[-1]).all()
    if len(jobID) > 0:
        # Only the first matching row is used
        jobID = jobID[0]
    else:
        # No row for this file: print the (empty) query result and skip the file
        print jobID
        continue
    ftsJID = jobID.ftsID.strip()
    ftsServer = jobID.ftsServer.strip()
    # if not ("fts3.gridpp" in ftsServer): continue
    try:
        context = fts3.Context(ftsServer)
    except Exception, e:
        # A server we cannot talk to only skips this file
        print "Exception creating FTS context ", e
        continue
    # print ftsServer
    print "Cancelling job : ", ftsJID, " file :", jFile.split(
        "/")[-1], " status ", jobID.ftsStatus.strip()
    try:
        # NOTE(review): `stat` is not used in the visible part of the script
        stat = fts3.cancel(context, ftsJID)
    except Exception, e:
        # A failed cancellation is reported and the loop moves on to the next file
        print "Exception in cancelling : ", e
        # sys.exit()
        continue
    # for jobID in rows:
    # if jobID.ftsStatus.strip() in finalStatuses: continue