def generateContext(ftsServer, ucert, lifetime=25200):
    """Build an fts3 context and explicitly delegate the proxy to the server.

    :param ftsServer: address of the fts3 server
    :param ucert: the path to the certificate to be used
    :param lifetime: duration (in sec) of the delegation to the FTS3 server
                     (default is 7h, like FTS3 default)

    :returns: S_OK(context) on success, S_ERROR otherwise
    """
    try:
        context = fts3.Context(endpoint=ftsServer,
                               ucert=ucert,
                               request_class=ftsSSLRequest,
                               verify=False)

        # Delegate explicitly so we are sure to get the lifetime we want.
        # Note: the server only re-delegates when it considers there is not
        # enough time left (currently 1 hour). So if no job is submitted for
        # more than 1h, there is no valid proxy on the FTS servers anymore.
        # Future FTS3 releases will trigger re-delegation at one third of the
        # remaining lifetime.
        # The proxy given as parameter may have less than "lifetime" left
        # (it is cached), which is fine because the FTS3Agent renews it often
        # enough.
        # Finally, FTS3 does not check the whole chain when computing the
        # proxy lifetime; this is under discussion:
        # https://its.cern.ch/jira/browse/FTS-1575
        fts3.delegate(context, lifetime=datetime.timedelta(seconds=lifetime))

        return S_OK(context)
    except FTS3ClientException as e:
        gLogger.exception("Error generating context", repr(e))
        return S_ERROR(repr(e))
def __call__(self):
    """
    a. makes the RESTFTS job
    b. submits FTS
    c. update status and create dropbox json
    """
    if self.user_proxy:
        # Pre-initialize so the except handler below can safely iterate
        # jobs_lfn even when files_for_transfer() itself raised before the
        # assignment (the original hit a NameError in that case).
        jobs, jobs_lfn, jobs_pfn, jobs_report = None, {}, {}, {}
        try:
            jobs, jobs_lfn, jobs_pfn, jobs_report = self.files_for_transfer()
            self.context = fts3.Context(self.fts_server_for_transfer, self.user_proxy, self.user_proxy, verify=True)
            self.logger.debug(fts3.delegate(self.context, lifetime=timedelta(hours=48), force=False))
        except Exception:
            # Delegation (or file gathering) failed: flag every known document
            # as failed so they are not silently stuck.
            self.logger.exception('delegation failed. Marking failed all documents.')
            for link, lfns in jobs_lfn.iteritems():
                self.logger.exception('Marking failed: %s' % lfns)
                self.mark_failed(lfns, force_fail=True, submission_error=True)
            return
        self.logger.debug("Processing files for %s " % self.user_proxy)
        if jobs:
            jobReport = self.command(jobs, jobs_lfn, jobs_pfn, jobs_report)
            # Retry the job-id bookkeeping until it succeeds.
            updated = False
            while not updated:
                updated = self.update_FTSJobID(jobReport)
    else:
        self.logger.debug("User proxy of %s could not be delagated! Trying next time." % self.user)
    self.logger.info('Transfers completed')
    return
def algorithm():
    """
    script algorithm
    - create fts REST HTTPRequest
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather list of file to transfers
      + group by source
      + submit ftsjob and save fts jobid
      + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """
    # TODO: pass by configuration
    fts = HTTPRequests('fts3.cern.ch:8446/', proxy, proxy)

    # One context for the whole run; delegation happens once up front.
    ftsContext = fts3.Context('https://fts3.cern.ch:8446', proxy, proxy, verify=True)
    delegationId = fts3.delegate(ftsContext, lifetime=timedelta(hours=48), force=False)
    logging.debug("Delegating proxy to FTS: " + delegationId)

    try:
        phedex = PhEDEx(responseType='xml',
                        httpDict={'key': proxy, 'cert': proxy, 'pycurl': True})
    except Exception as e:
        logging.exception('PhEDEx exception: %s', e)
        return

    # First update the state of ongoing jobs, then submit new ones.
    jobs_ongoing = state_manager(fts)
    new_jobs = submission_manager(phedex, ftsContext)

    logging.debug("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)
    return
def submit(proxy, toTrans, source, destination): # prepare rest job with 200 files per job transfers = [] for files in chunks(toTrans, 200): c = pycurl.Curl() # create destination and source pfns for job for lfn in files: print(lfn) transfers.append( fts3.new_transfer(apply_tfc_to_lfn(source, lfn, c), apply_tfc_to_lfn(destination, lfn, c))) c.close() # Submit fts job context = fts3.Context('https://fts3.cern.ch:8446', proxy, proxy, verify=True) print(fts3.delegate(context, lifetime=timedelta(hours=48), force=False)) job = fts3.new_job(transfers) #print("Monitor link: https://fts3.cern.ch:8449/fts3/ftsmon/#/job/"+fts3.submit(context, job)) jobid = fts3.submit(context, job) #for file in (fts3.get_job_status(context, jobid, list_files=True))["files"]: for key, value in (fts3.get_job_status(context, jobid, list_files=True)).iteritems(): print key
def delegate_proxy(self, proxy, ca_path='/etc/grid-security/certificates/', duration_hours=96, timeleft_hours=72):
    """Delegate user proxy to fts server if the lifetime is less than timeleft_hours

    :param proxy: proxy to be delegated
    :type proxy: str
    :param ca_path: ca path for verification, defaults to '/etc/grid-security/certificates/'
    :param ca_path: str, optional
    :param duration_hours: delegation validity duration in hours, defaults to 96
    :param duration_hours: int, optional
    :param timeleft_hours: minimal delegation time left in hours below which re-delegation happens, defaults to 72
    :param timeleft_hours: int, optional
    :return: tuple of (delegation ID, fts3 context)
    :rtype: tuple
    """
    logging.info("Delegating proxy %s to %s", proxy, self.external_host)
    start_time = time.time()
    try:
        # The proxy serves as both certificate and key for the context.
        context = Context(self.external_host, ucert=proxy, ukey=proxy, verify=True, capath=ca_path)
        # delegate() only re-delegates when the remaining lifetime on the FTS
        # server drops below timeleft_hours.
        delegation_id = delegate(
            context,
            lifetime=datetime.timedelta(hours=duration_hours),
            delegate_when_lifetime_lt=datetime.timedelta(
                hours=timeleft_hours))
        record_timer('transfertool.fts3.delegate_proxy.success.%s' % proxy, (time.time() - start_time))
    except ServerError:
        logging.error("Server side exception during FTS proxy delegation.")
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except ClientError:
        logging.error("Config side exception during FTS proxy delegation.")
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except BadEndpoint:
        logging.error("Wrong FTS endpoint: %s", self.external_host)
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except ReadTimeout as error:
        # Transport-level failures are remapped to the transfertool's own
        # exception types for callers to catch.
        raise TransferToolTimeout(error)
    except JSONDecodeError as error:
        raise TransferToolWrongAnswer(error)
    logging.info("Delegated proxy %s", delegation_id)
    return delegation_id, context
def fts3_delegate(fts3_endpoint='https://fts3-pilot.cern.ch:8446'):
    """Ensure a valid proxy and a valid delegation on the FTS3 server.

    Creates a VOMS proxy if the current one is expired, then checks the
    existing delegation on the server and renews it (12h lifetime) when it is
    missing or expires within the next hour.

    :param fts3_endpoint: FTS3 REST endpoint to delegate to
    """
    if voms_proxy_expired():
        print("INFO: creating new proxy.")
        proxy = voms_proxy_init()
        if proxy:
            print("Proxy info:")
            print("path: {}".format(proxy['path']))
            print("expiration: {}".format(proxy['expiration']))
            print("timestamp: {}".format(proxy['TS']))
        else:
            print("FATAL: proxy creation failed.")
            return
    else:
        proxy = voms_proxy_info()
        print("INFO: proxy valid, avoiding recreation.")
    # NOTE: dropped the redundant "fts3_context = context = ..." double
    # assignment from the original; only fts3_context was ever used.
    fts3_context = fts3.Context(fts3_endpoint, verify=True)
    whoami = fts3.whoami(fts3_context)
    no_valid_delegation = False
    termination_time = datetime.utcnow()
    elapsed_threshold = timedelta(hours=1)
    try:
        delegation_ID = whoami['delegation_id']
        check_delegation_json = fts3_check_delegation(delegation_ID, proxy, fts3_endpoint)
        if check_delegation_json:
            termination_time = datetime.strptime(
                check_delegation_json['termination_time'].replace('T', ' '),
                '%Y-%m-%d %H:%M:%S')
            print('INFO: Delegation valid until {} UTC'.format(
                termination_time.strftime('%H:%M:%S %Y-%m-%d')))
        else:
            no_valid_delegation = True
    except Exception:
        # BUGFIX: the original set this flag to False here, claiming a valid
        # delegation on failure. An exception (missing 'delegation_id' key or
        # a failed check) means there is NO valid delegation, so renew.
        no_valid_delegation = True
    if no_valid_delegation:
        print("INFO: no valid delegation found")
    # Renew when the delegation expires within the threshold or none exists.
    if (termination_time - elapsed_threshold) < datetime.utcnow() or no_valid_delegation:
        print('INFO: Renewing delegation!')
        delegation_ID_2 = fts3.delegate(fts3_context, lifetime=timedelta(hours=12), force=True)
        print('INFO: New delegation ID = {}'.format(delegation_ID_2))
    else:
        print('INFO: Nothing to do...')
def algorithm(self): """ - delegate and use opsproxy (once every 12h) - Look into Monitor user folders and if the user is not in the queue put it there :return: """ # TODO: monitor is probably better with multiproc workers = list() for i in range(self.config.max_threads_num): worker = Thread(target=self.worker, args=(i, self.q)) worker.setDaemon(True) worker.start() workers.append(worker) count = 0 while not self.STOP: if count == 0 and not self.config.TEST: self.context = fts3.Context(self.config_getter.serverFTS, self.config_getter.opsProxy, self.config_getter.opsProxy, verify=True) self.logger.debug( fts3.delegate(self.context, lifetime=timedelta(hours=48), force=False)) for folder in os.listdir('Monitor'): user = folder jobs = os.listdir('Monitor/' + user) if not len(jobs) == 0 and user not in self.active_users: self.active_users.append(user) self.q.put(user) elif len(jobs) == 0 and user in self.active_users: self.active_users.remove(user) if count < 6 * 60 * 12: # delegate every 12h count += 1 else: count = 0 self.logger.info('%s active users' % len(self.active_users)) self.logger.debug('Active users are: %s' % self.active_users) self.logger.debug('Queue lenght: %s' % self.q.qsize()) time.sleep(10) for w in workers: w.join() self.logger.info('Monitor stopped.')
def __call__(self):
    """
    a. makes the RESTFTS job
    b. submits FTS
    c. update status and create dropbox json
    """
    if self.user_proxy:
        # Pre-initialize so the fall-through after a failed delegation does
        # not hit a NameError on "if jobs:" (the original crashed there
        # whenever the try block raised before files_for_transfer returned).
        jobs, jobs_lfn, jobs_pfn, jobs_report = None, {}, {}, {}
        try:
            self.context = fts3.Context(self.fts_server_for_transfer, self.user_proxy, self.user_proxy, verify=True)
            self.logger.debug(fts3.delegate(self.context, lifetime=timedelta(hours=48), force=False))
            jobs, jobs_lfn, jobs_pfn, jobs_report = self.files_for_transfer()
        except Exception:
            self.logger.exception('delegation failed')
        self.logger.debug("Processing files for %s " % self.user_proxy)
        if jobs:
            self.command(jobs, jobs_lfn, jobs_pfn, jobs_report)
    else:
        self.logger.debug("User proxy of %s could not be delagated! Trying next time." % self.user)
    self.logger.info('Transfers completed')
    return
def killThread(self, thread_id, transfers):
    """This is the worker thread function for kill command.

    Loops forever pulling lists of transfer documents from the *transfers*
    queue. For each list it resolves the user's DN and proxy, delegates the
    proxy to the FTS server, groups the files by FTS job id, cancels them via
    fts3.cancel and records the KILLED state in Oracle.

    :param thread_id: numeric id of this worker, used only for logging
    :param transfers: queue of lists of transfer documents (dicts)
    """
    while True:
        transfer_list = transfers.get()
        self.logger.info("Starting thread %s" % (thread_id))
        user = transfer_list[0]['username']
        group = transfer_list[0]['user_group']
        role = transfer_list[0]['user_role']
        uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy))
        try:
            userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            # BUGFIX: mark the item done before continuing — every other
            # error branch does; skipping it leaks queue accounting and
            # would hang a transfers.join().
            transfers.task_done()
            continue
        if not userDN:
            transfers.task_done()
            time.sleep(1)
            continue
        self.logger.debug("user DN: %s" % userDN)
        try:
            defaultDelegation = {'logger': self.logger,
                                 'credServerPath': self.config.credentialDir,
                                 'myProxySvr': 'myproxy.cern.ch',
                                 'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                 'serverDN': self.config.serverDN,
                                 'uisource': uiSetupScript,
                                 'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
            if hasattr(self.config, "cache_area"):
                cache_area = self.config.cache_area
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
        except IndexError:
            self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
            transfers.task_done()
            time.sleep(1)
            continue
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey
        try:
            defaultDelegation['userDN'] = userDN
            defaultDelegation['group'] = group if group else ''
            defaultDelegation['role'] = role if group else ''
            self.logger.debug('delegation: %s' % defaultDelegation)
            valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            transfers.task_done()
            time.sleep(1)
            continue
        # TODO: take server from db, right now, take only the first of the list and assuming it valid for all
        try:
            # TODO: debug u added during info upload. To be fixed soon! For now worked around
            fts_server = transfer_list[0]['fts_instance'].split('u')[1]
            self.logger.info("Delegating proxy to %s" % fts_server)
            context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True)
            self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))
            self.logger.info("Proxy delegated. Grouping files by jobId")
            jobs = {}
            for fileToKill in transfer_list:
                # TODO: debug u added during info upload. To be fixed soon! For now worked around
                jid = str(fileToKill['fts_id']).split('u')[1]
                if jid not in jobs:
                    jobs[jid] = []
                jobs[jid].append(fileToKill)
            self.logger.info("Found %s jobIds", len(jobs.keys()))
            # BUGFIX: the original passed the bound method (jobs.keys) and
            # logged its repr instead of the keys; call it.
            self.logger.debug("jobIds: %s", jobs.keys())
            # list for files killed or failed to
            killed = []
            too_late = []
            for ftsJobId, files in jobs.iteritems():
                self.logger.info("Cancelling tranfers in %s" % ftsJobId)
                ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files]
                source_lfns = [x['source_lfn'] for x in files]
                job_list = fts3.get_job_status(context, ftsJobId, list_files=True)
                tx = job_list['files']
                # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db?
                idListToKill = [x['file_id'] for x in tx
                                if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]
                # needed for the state update
                lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx
                                 if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]
                self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill))
                res = fts3.cancel(context, ftsJobId, idListToKill)
                self.logger.debug('Kill command result: %s' % json.dumps(res))
                # fts3.cancel returns a scalar for a single file; normalize.
                if not isinstance(res, list):
                    res = [res]
                # Verify if the kill command succeeded
                # NOTE(review): this assumes fts3.cancel returns states in the
                # same order as idListToKill — confirm against the bindings.
                for k, kill_res in enumerate(res):
                    indexToUpdate = lfnListToKill[k]
                    if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"):
                        # Terminal state: too late to kill this transfer.
                        self.logger.debug(source_lfns[indexToUpdate])
                        too_late.append(getHashLfn(source_lfns[indexToUpdate]))
                    else:
                        killed.append(getHashLfn(source_lfns[indexToUpdate]))
            # TODO: decide how to update status for too_late files
            killed += too_late
            self.logger.debug('Updating status of killed files: %s' % killed)
            if len(killed) > 0:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = killed
                data['list_of_transfer_state'] = ["KILLED" for _ in killed]
                self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(data))
                self.logger.debug("Marked killed %s" % killed)
        except Exception:
            # TODO: split and improve try/except
            self.logger.exception('Kill command failed')
        transfers.task_done()
def algorithm():
    """
    script algorithm
    - create fts REST HTTPRequest
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather list of file to transfers
      + group by source
      + submit ftsjob and save fts jobid
      + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """
    # TODO: pass by configuration
    fts = HTTPRequests(hostname=FTS_ENDPOINT.split("https://")[1], localcert=proxy, localkey=proxy)
    logging.info("using user's proxy from %s", proxy)
    ftsContext = fts3.Context(FTS_ENDPOINT, proxy, proxy, verify=True)
    logging.info("Delegating proxy to FTS...")
    # Re-delegation only actually happens when less than 24h is left on the
    # server-side delegation.
    delegationId = fts3.delegate(ftsContext, lifetime=timedelta(hours=48),
                                 delegate_when_lifetime_lt=timedelta(hours=24),
                                 force=False)
    delegationStatus = fts.get("delegation/"+delegationId)
    logging.info("Delegated proxy valid until %s", delegationStatus[0]['termination_time'])
    # instantiate an object to talk with CRAB REST server
    try:
        crabserver = CRABRest(restInfo['host'], localcert=proxy, localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return
    # The first line of transfers.txt carries the task metadata as JSON.
    with open("task_process/transfers.txt") as _list:
        _data = _list.readlines()[0]
    try:
        doc = json.loads(_data)
        username = doc["username"]
        taskname = doc["taskname"]
        destination = doc["destination"]
    except Exception as ex:
        msg = "Username gathering failed with\n%s" % str(ex)
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(msg)
        # bare raise preserves the original traceback (the original's
        # "raise ex" re-raised from here, losing the raise site)
        raise
    try:
        logging.info("Initializing Rucio client")
        os.environ["X509_USER_PROXY"] = proxy
        logging.info("Initializing Rucio client for %s", taskname)
        rucioClient = CRABDataInjector(taskname,
                                       destination,
                                       account=username,
                                       scope="user."+username,
                                       auth_type='x509_proxy')
    except Exception as exc:
        msg = "Rucio initialization failed with\n%s" % str(exc)
        logging.warning(msg)
        raise
    # Update states of ongoing jobs first, then submit new ones.
    jobs_ongoing = state_manager(fts, crabserver)
    new_jobs = submission_manager(rucioClient, ftsContext, crabserver)
    logging.info("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)
    return
def algorithm():
    """
    script algorithm
    - instantiates FTS3 python easy client
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather list of file to transfers
      + group by source
      + submit ftsjob and save fts jobid
      + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """
    logging.info("using user's proxy from %s", proxy)
    ftsContext = fts3.Context(FTS_ENDPOINT, proxy, proxy, verify=True)
    logging.info("Delegating proxy to FTS...")
    delegationId = fts3.delegate(  # pylint: disable=unused-variable
        ftsContext,
        lifetime=timedelta(hours=48),
        delegate_when_lifetime_lt=timedelta(hours=24),
        force=False)
    # we never had problems with delegation since we put proper delegate_when in the above line
    # but if need to check delegation arise, it can be done with a query like
    # curl --cert proxy --key proxy https://fts3-cms.cern.ch:8446/delegation/<delegatinId>
    # see: https://fts3-docs.web.cern.ch/fts3-docs/fts-rest/docs/api.html#get-delegationdlgid
    # instantiate an object to talk with CRAB REST server
    try:
        crabserver = CRABRest(restInfo['host'], localcert=proxy, localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return
    # The first line of transfers.txt carries the task metadata as JSON.
    with open("task_process/transfers.txt") as _list:
        _data = _list.readlines()[0]
    try:
        doc = json.loads(_data)
        username = doc["username"]
        taskname = doc["taskname"]
        destination = doc["destination"]
    except Exception as ex:
        msg = "Username gathering failed with\n%s" % str(ex)
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(msg)
        # bare raise preserves the original traceback
        raise
    try:
        logging.info("Initializing Rucio client")
        # NOTE: the original set X509_USER_PROXY twice in a row; the
        # duplicate assignment was removed.
        os.environ["X509_USER_PROXY"] = proxy
        rucioClient = rucioclient(account=username, auth_type='x509_proxy')
    except Exception as exc:
        msg = "Rucio initialization failed with\n%s" % str(exc)
        logging.warning(msg)
        raise
    # Update states of ongoing jobs first, then submit new ones.
    jobs_ongoing = state_manager(ftsContext, crabserver)
    new_jobs = submission_manager(rucioClient, ftsContext, crabserver)
    logging.info("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)
    return
def worker(self, i, inputs):
    """
    - Retrieve userDN
    - Retrieve user proxy
    - Delegate proxy to fts is needed
    - submit fts job
    - update doc states
    :param i: thread number
    :param inputs: tuple (lfns, _user, source, dest, tfc_map)
    :return:
    """
    # TODO: differentiate log messages per USER!
    logger = self.logger
    logger.info("Process %s is starting. PID %s", i, os.getpid())
    lock = Lock()
    Update = update(logger, self.config)
    # Main loop: poll the input queue until self.STOP is set.
    while not self.STOP:
        if inputs.empty():
            time.sleep(10)
            continue
        try:
            lfns, _user, source, dest, tfc_map = inputs.get()
            [user, group, role] = _user
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            continue
        start = time.time()
        if not self.config.TEST:
            # Production path: resolve the user's DN and fetch a myproxy
            # credential for them.
            try:
                userDN = getDNFromUserName(user, logger,
                                           ckey=self.config.opsProxy,
                                           cert=self.config.opsProxy)
            except Exception as ex:
                logger.exception('Cannot retrieve user DN')
                self.critical_failure(lfns, lock, inputs)
                continue
            defaultDelegation = {
                'logger': logger,
                'credServerPath': self.config.credentialDir,
                'myProxySvr': 'myproxy.cern.ch',
                'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                'serverDN': self.config.serverDN,
                'uisource': '',
                'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)
            }
            cache_area = self.config.cache_area
            try:
                # The myproxy account is the hostname part of the cache URL.
                defaultDelegation['myproxyAccount'] = re.compile(
                    'https?://([^/]*)/.*').findall(cache_area)[0]
            except IndexError:
                logger.error(
                    'MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
                self.critical_failure(lfns, lock, inputs)
                continue
            if getattr(self.config, 'serviceCert', None):
                defaultDelegation['server_cert'] = self.config.serviceCert
            if getattr(self.config, 'serviceKey', None):
                defaultDelegation['server_key'] = self.config.serviceKey
            try:
                defaultDelegation['userDN'] = userDN
                defaultDelegation['group'] = group
                defaultDelegation['role'] = role
                logger.debug('delegation: %s' % defaultDelegation)
                valid_proxy, user_proxy = getProxy(defaultDelegation, logger)
                if not valid_proxy:
                    # No usable proxy: push the docs back to retry and move on.
                    logger.error(
                        'Failed to retrieve user proxy... putting docs on retry'
                    )
                    logger.error(
                        'docs on retry: %s' % Update.failed(lfns, submission_error=True))
                    continue
            except Exception:
                logger.exception('Error retrieving proxy')
                self.critical_failure(lfns, lock, inputs)
                continue
        else:
            # Test mode: reuse the operations proxy instead of the user's.
            user_proxy = self.config.opsProxy
            self.logger.debug("Using opsProxy for testmode")
        context = dict()
        try:
            if self.config.TEST:
                logger.debug("Running in test mode, submitting fake jobs")
            else:
                context = fts3.Context(self.config.serverFTS, user_proxy, user_proxy, verify=True)
                logger.debug(
                    fts3.delegate(context, lifetime=timedelta(hours=48), force=False))
        except Exception:
            logger.exception("Error submitting to FTS")
            self.critical_failure(lfns, lock, inputs)
            continue
        failed_lfn = list()
        try:
            if self.config.TEST:
                # Fake submission: derive a deterministic job id from the
                # first LFN.
                submitted_lfn = lfns
                jobid = getHashLfn(lfns[0][0])
                self.logger.debug('Fake job id: ' + jobid)
            else:
                failed_lfn, submitted_lfn, jobid = Submission(
                    lfns, source, dest, i, self.logger, fts3, context, tfc_map)
                # jobid == -1 is Submission's failure sentinel.
                if jobid == -1:
                    self.critical_failure(lfns, lock, inputs)
                    continue
                logger.info('Submitted %s files' % len(submitted_lfn))
        except Exception:
            logger.exception("Unexpected error during FTS job submission!")
            self.critical_failure(lfns, lock, inputs)
            continue
        # TODO: add file FTS id and job id columns for kill command
        try:
            Update.submitted(lfns)
        except Exception:
            logger.exception("Error updating document status")
            self.critical_failure(lfns, lock, inputs)
            continue
        try:
            Update.failed(failed_lfn)
        except Exception:
            logger.exception(
                "Error updating document status, job submission will be retried later..."
            )
            self.critical_failure(lfns, lock, inputs)
            continue
        try:
            # Drop a per-job file so the Monitor component picks the job up.
            createLogdir('Monitor/' + user)
            with open('Monitor/' + user + '/' + str(jobid) + '.txt', 'w') as outfile:
                json.dump(lfns, outfile)
            logger.info('Monitor files created')
        except Exception:
            logger.exception("Error creating file for monitor")
            self.critical_failure(lfns, lock, inputs)
            continue
        end = time.time()
        self.logger.info('Input processed in %s', str(end - start))
        time.sleep(0.5)
    logger.debug("Worker %s exiting.", i)
    return 0