def failed(self, files, failures_reasons=None, max_retry=3, force_fail=False, submission_error=False):
    """
    Mark a list of files as failed (or to be retried) in the transfers table
    :param files: list of (source_lfn, dest_lfn) tuples
    :param failures_reasons: list(str) with the reason of failure for each file
    :param max_retry: number of retries before giving up
    :param force_fail: flag for triggering failure without retry
    :param submission_error: True if the error happened during FTS submission
    :return: 0 on success, 1 on failure
    """
    if failures_reasons is None:
        failures_reasons = []
    updated_lfn = []
    for Lfn in files:
        lfn = Lfn[0]
        # Load document and get the retry_count
        docId = getHashLfn(lfn)
        self.logger.debug("Marking failed %s" % docId)
        try:
            docbyId = self.oracleDB.get(self.config.oracleUserFileTrans.replace('filetransfer', 'fileusertransfers'),
                                        data=encodeRequest({'subresource': 'getById', 'id': docId}))
            document = oracleOutputMapping(docbyId, None)[0]
            self.logger.debug("Document: %s" % document)
        except Exception as ex:
            self.logger.error("Error updating failed docs: %s" % ex)
            return 1

        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'updateTransfers'
        fileDoc['list_of_ids'] = docId

        if failures_reasons:
            try:
                fileDoc['list_of_failure_reason'] = failures_reasons[files.index(Lfn)]
            except Exception:
                fileDoc['list_of_failure_reason'] = "unexpected error, missing reasons"
                self.logger.exception("missing reasons")

        if force_fail or document['transfer_retry_count'] + 1 > max_retry:
            fileDoc['list_of_transfer_state'] = 'FAILED'
        else:
            fileDoc['list_of_transfer_state'] = 'RETRY'
        if submission_error:
            fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
        # the retry counter is bumped by one in every case
        fileDoc['list_of_retry_value'] = 1

        self.logger.debug("update: %s" % fileDoc)
        try:
            updated_lfn.append(docId)
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
        except Exception:
            self.logger.exception('ERROR updating failed documents')
            return 1
    self.logger.debug("failed file updated")
    return 0
def algorithm(self, parameters=None): """ Performs the doRetries method, loading the appropriate plugin for each job and handling it. """ logging.debug("Running retryManager algorithm") if self.config.isOracle: fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'retryTransfers' fileDoc['time_to'] = self.cooloffTime self.logger.debug('fileDoc: %s' % fileDoc) try: results = self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) except Exception: self.logger.exception("Failed to get retry transfers in oracleDB: %s") return logging.info("Retried files in cooloff: %s,\n now getting transfers to kill" % str(results)) fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'getTransfersToKill' fileDoc['grouping'] = 0 try: results = self.oracleDB.get(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) result = oracleOutputMapping(results) except Exception as ex: self.logger.error("Failed to get killed transfers \ from oracleDB: %s" % ex) return usersToKill = list(set([(x['username'], x['user_group'], x['user_role']) for x in result])) self.logger.debug("Users with transfers to kill: %s" % usersToKill) transfers = Queue() for i in range(self.config.kill_threads): worker = Thread(target=self.killThread, args=(i, transfers,)) worker.setDaemon(True) worker.start() for user in usersToKill: user_trans = [x for x in result if (x['username'], x['user_group'], x['user_role']) == user] self.logger.info("Inserting %s transfers of user %s in the killing queue" % (len(user_trans), user)) transfers.put(user_trans) transfers.join() self.logger.info("Transfers killed.") else: self.doRetries()
def active_tasks(self, db): fileDoc = {} fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'acquirePublication' self.logger.debug("Retrieving publications from oracleDB") results = '' try: results = db.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) except Exception as ex: self.logger.error("Failed to acquire publications \ from oracleDB: %s" %ex) return [] fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'acquiredPublication' fileDoc['grouping'] = 0 fileDoc['limit'] = 100000 self.logger.debug("Retrieving max.100000 acquired publications from oracleDB") result = [] try: results = db.get(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) result.extend(oracleOutputMapping(results)) except Exception as ex: self.logger.error("Failed to acquire publications \ from oracleDB: %s" %ex) return [] self.logger.debug("publen: %s" % len(result)) self.logger.debug("%s acquired publications retrieved" % len(result)) #TODO: join query for publisher (same of submitter) unique_tasks = [list(i) for i in set(tuple([x['username'], x['user_group'], x['user_role'], x['taskname']] ) for x in result if x['transfer_state'] == 3)] info = [] for task in unique_tasks: info.append([x for x in result if x['taskname'] == task[3]]) return zip(unique_tasks, info)
def mark_failed(files, oracleDB, logger, failure_reason=""): """ Something failed for these files so increment the retry count """ h = 0 for lfn in files: h += 1 logger.debug("Marking failed %s" % h) source_lfn = lfn docId = getHashLfn(source_lfn) logger.debug("Marking failed %s" % docId) try: docbyId = oracleDB.get(config.General.oracleUserTrans, data=encodeRequest({ 'subresource': 'getById', 'id': docId })) except Exception: logger.exception("Error updating failed docs.") continue document = oracleOutputMapping(docbyId, None)[0] logger.debug("Document: %s" % document) try: fileDoc = dict() fileDoc['asoworker'] = 'asodciangot1' fileDoc['subresource'] = 'updatePublication' fileDoc['list_of_ids'] = docId fileDoc['list_of_publication_state'] = 'FAILED' #if force_failure or document['publish_retry_count'] > self.max_retry: # fileDoc['list_of_publication_state'] = 'FAILED' #else: # fileDoc['list_of_publication_state'] = 'RETRY' # TODO: implement retry fileDoc['list_of_retry_value'] = 1 fileDoc['list_of_failure_reason'] = failure_reason logger.debug("fileDoc: %s " % fileDoc) _ = oracleDB.post(config.General.oracleFileTrans, data=encodeRequest(fileDoc)) logger.debug("updated: %s " % docId) except Exception as ex: msg = "Error updating document: %s" % fileDoc msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) continue
def mark_failed(ids, failures_reasons, oracleDB):
    """
    Mark the list of files as failed
    :param ids: list of Oracle file ids to update
    :param failures_reasons: list of strings with transfer failure messages
    :param oracleDB: HTTPRequests object pointing to the CRAB REST
    :return: the list of ids on success, None on failure
    """
    os.environ["X509_CERT_DIR"] = os.getcwd()

    if len(ids) > 0:
        try:
            data = dict()
            data['asoworker'] = 'rucio'
            data['subresource'] = 'updateTransfers'
            data['list_of_ids'] = ids
            data['list_of_transfer_state'] = ["FAILED" for _ in ids]
            data['list_of_failure_reason'] = failures_reasons
            data['list_of_retry_value'] = [0 for _ in ids]

            oracleDB.post('/filetransfers', data=encodeRequest(data))
            logging.info("Marked failed %s", ids)
        except Exception:
            logging.exception("Error updating documents")
            return None
    else:
        logging.info("Nothing to update (Failed)")

    return ids
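
# Hedged usage sketch (added for illustration, not part of the original module).
# It assumes an already-configured HTTPRequests client pointing at the CRAB REST;
# the ids and reasons below are hypothetical placeholders.
def _example_mark_failed_usage(oracleDB):
    """Show the calling convention of mark_failed() above: one reason per id."""
    ids = ['a1b2c3d4e5', 'f6a7b8c9d0']                       # hypothetical Oracle file ids
    reasons = ['Transfer canceled by the user'] * len(ids)   # one free-text message per id
    return mark_failed(ids, reasons, oracleDB)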
def mark_good(workflow, files, oracleDB, logger):
    """
    Mark the list of files as published (publication state DONE)
    """
    wfnamemsg = "%s: " % workflow
    for lfn in files:
        data = {}
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        msg = "Marking file %s as published." % lfn
        msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
        logger.info(wfnamemsg + msg)
        data['asoworker'] = 'asodciangot1'
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = docId
        data['list_of_publication_state'] = 'DONE'
        data['list_of_retry_value'] = 1
        data['list_of_failure_reason'] = ''
        try:
            result = oracleDB.post(config.General.oracleFileTrans,
                                   data=encodeRequest(data))
            logger.debug("updated: %s %s " % (docId, result))
        except Exception as ex:
            logger.error("Error during status update: %s" % ex)
def mark_failed(ids, failures_reasons): """ Mark the list of files as failed :param ids: list of Oracle file ids to update :param failures_reasons: list of strings with transfer failure messages :return: 0 success, 1 failure """ try: oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy) data = dict() data['asoworker'] = 'asoless' data['subresource'] = 'updateTransfers' data['list_of_ids'] = ids data['list_of_transfer_state'] = ["FAILED" for _ in ids] data['list_of_failure_reason'] = failures_reasons data['list_of_retry_value'] = [0 for _ in ids] oracleDB.post('/filetransfers', data=encodeRequest(data)) logging.debug("Marked failed %s", ids) except Exception: logging.exception("Error updating documents") return 1 return 0
def mark_transferred(ids, server):
    """
    Mark the list of files as transferred
    :param ids: list of Oracle file ids to update
    :param server: REST client (e.g. CRABRest/HTTPRequests) used for the POST
    :return: elapsed time of the POST, in seconds
    """
    try:
        print("Marking done %d files" % len(ids))
        data = dict()
        data['asoworker'] = asoworker
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = ids
        data['list_of_transfer_state'] = ["DONE" for _ in ids]
        t1 = time.time()
        server.post(api='filetransfers', data=encodeRequest(data))
        t2 = time.time()
        print("Marked good")
        elapsed = int(t2 - t1)
    except Exception as ex:
        t2 = time.time()
        elapsed = int(t2 - t1)
        print("Error updating documents:\n %s" % str(ex))
    return elapsed
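
# Hedged usage sketch (not part of the original module): callers may want to keep
# each POST small since list_of_ids is encoded in the request. The chunk size of
# 100 below is an arbitrary illustrative choice.
def _example_mark_transferred_in_chunks(ids, server):
    """Call mark_transferred() above on successive slices of ids and sum the elapsed time."""
    total = 0
    for i in range(0, len(ids), 100):
        total += mark_transferred(ids[i:i + 100], server)
    print("total time spent in POSTs: %d s" % total)
    return total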
def getPublDescFiles(self, workflow, lfn_ready, logger):
    """
    Download and read the files describing what needs to be published.
    CRAB REST does not have any good way to select from the DB only what we need;
    the most efficient way is to get the full list for the task and then trim it here.
    See: https://github.com/dmwm/CRABServer/issues/6124
    """
    out = []

    dataDict = {}
    dataDict['taskname'] = workflow
    dataDict['filetype'] = 'EDM'
    data = encodeRequest(dataDict)
    try:
        res = self.crabServer.get(api='filemetadata', data=data)
        # res is a 3-tuple: (result, exit code, status)
        res = res[0]
    except Exception as ex:
        logger.error("Error during metadata retrieving from crabserver:\n%s", ex)
        return out

    metadataList = [json.loads(md) for md in res['result']]  # CRAB REST returns a list of JSON objects
    for md in metadataList:
        # pick only the metadata we need
        if md['lfn'] in lfn_ready:
            out.append(md)

    logger.info('Got filemetadata for %d LFNs', len(out))
    return out
def getAcquired(self, users): """ Get a number of documents to be submitted (in ACQUIRED status) and return results of the query for logs :return: """ documents = list() for user in users: username = user[0] group = user[1] role = user[2] fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'acquiredTransfers' fileDoc['grouping'] = 1 fileDoc['username'] = username if group == '': group = None if role == '': role = None fileDoc['vogroup'] = group fileDoc['vorole'] = role self.logger.debug("Retrieving users from oracleDB") try: results = self.oracleDB.get(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) documents += oracleOutputMapping(results) except Exception as ex: self.logger.error("Failed to get acquired transfers \ from oracleDB: %s" % ex) return documents
def mark_good(files, crabServer, logger): """ Mark the list of files as tranferred """ msg = "Marking %s file(s) as published." % len(files) logger.info(msg) if dryRun: logger.info("DryRun: skip marking good file") return nMarked = 0 for lfn in files: data = {} source_lfn = lfn docId = getHashLfn(source_lfn) data['asoworker'] = config.General.asoworker data['subresource'] = 'updatePublication' data['list_of_ids'] = [docId] data['list_of_publication_state'] = ['DONE'] data['list_of_retry_value'] = [1] data['list_of_failure_reason'] = [''] try: result = crabServer.post(api='filetransfers', data=encodeRequest(data)) logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result) except Exception as ex: logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn) logger.error("Error reason: %s", ex) nMarked += 1 if nMarked % 10 == 0: logger.info('marked %d files', nMarked)
def mark_transferred(ids):
    """
    Mark the list of files as transferred
    :param ids: list of Oracle file ids to update
    :return: 0 success, 1 failure
    """
    try:
        oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy)
        logging.debug("Marking done %s", ids)
        data = dict()
        data['asoworker'] = 'asoless'
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = ids
        data['list_of_transfer_state'] = ["DONE" for _ in ids]
        oracleDB.post('/filetransfers', data=encodeRequest(data))
        logging.debug("Marked good %s", ids)
    except Exception:
        logging.exception("Error updating documents")
        return 1
    return 0
def mark_acquired(self, files=[]): """ Mark the list of files as tranferred """ lfn_in_transfer = [] dash_rep = () if self.config.isOracle: toUpdate = list() for lfn in files: if lfn['value'][0].find('temp') == 7: docId = lfn['key'][5] toUpdate.append(docId) try: docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers','fileusertransfers'), data=encodeRequest({'subresource': 'getById', 'id': docId})) document = oracleOutputMapping(docbyId, None)[0] dash_rep = (document['jobid'], document['job_retry_count'], document['taskname']) lfn_in_transfer.append(lfn) except Exception as ex: self.logger.error("Error during dashboard report update: %s" %ex) return [],() return lfn_in_transfer, dash_rep else: for lfn in files: if lfn['value'][0].find('temp') == 7: docId = getHashLfn(lfn['value'][0]) self.logger.debug("Marking acquired %s" % docId) # Load document to get the retry_count try: document = self.db.document(docId) except Exception as ex: msg = "Error loading document from couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue if document['state'] == 'new' or document['state'] == 'retry': data = dict() data['state'] = 'acquired' data['last_update'] = time.time() updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId updateUri += "?" + urllib.urlencode(data) try: self.db.makeRequest(uri=updateUri, type="PUT", decode=False) except Exception as ex: msg = "Error updating document in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue self.logger.debug("Marked acquired %s of %s" % (docId, lfn)) lfn_in_transfer.append(lfn) dash_rep = (document['jobid'], document['job_retry_count'], document['workflow']) else: continue else: good_lfn = lfn['value'][0].replace('store', 'store/temp', 1) self.mark_good([good_lfn]) return lfn_in_transfer, dash_rep
def mark_failed(files, crabServer, logger, failure_reason=""):
    """
    Mark the list of files as FAILED in the publication status
    """
    msg = "Marking %s file(s) as failed" % len(files)
    logger.info(msg)
    if dryRun:
        logger.debug("DryRun: skip marking failed files")
        return

    nMarked = 0
    for lfn in files:
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        data = dict()
        data['asoworker'] = config.General.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = [docId]
        data['list_of_publication_state'] = ['FAILED']
        data['list_of_retry_value'] = [1]
        data['list_of_failure_reason'] = [failure_reason]
        logger.debug("data: %s ", data)
        try:
            result = crabServer.post(api='filetransfers', data=encodeRequest(data))
            logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
        except Exception as ex:
            logger.error("Error updating status for DocumentId: %s lfn: %s", docId, source_lfn)
            logger.error("Error reason: %s", ex)

        nMarked += 1
        if nMarked % 10 == 0:
            logger.info('marked %d files', nMarked)
def pubFailed(self, task, files, failure_reasons=list(), force_failure=False): """ :param files: :param failure_reasons: :return: """ id_list = list() for Lfn in files: source_lfn = Lfn[0] docId = getHashLfn(source_lfn) id_list.append(docId) self.logger.debug("Marking failed %s" % docId) fileDoc = dict() fileDoc['asoworker'] = 'asodciangot1' fileDoc['subresource'] = 'updatePublication' fileDoc['list_of_ids'] = id_list fileDoc['list_of_publication_state'] = ['FAILED' for x in id_list] # TODO: implement retry, publish_retry_count missing from input? fileDoc['list_of_retry_value'] = [1 for x in id_list] fileDoc['list_of_failure_reason'] = failure_reasons try: self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) self.logger.debug("updated failed: %s " % id_list) except Exception: msg = "Error updating failed documents" self.logger.exception(msg)
def pubDone(self, workflow, files): """ :param files: :param workflow: :return: """ wfnamemsg = "%s: " % workflow data = dict() id_list = list() for lfn in files: source_lfn = lfn docId = getHashLfn(source_lfn) id_list.append(docId) msg = "Marking file %s as published." % lfn msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn) self.logger.info(wfnamemsg + msg) data['asoworker'] = self.config.asoworker data['subresource'] = 'updatePublication' data['list_of_ids'] = id_list data['list_of_publication_state'] = ['DONE' for x in id_list] try: self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(data)) self.logger.debug("updated done: %s " % id_list) except Exception as ex: self.logger.error("Error during status update for published docs: %s" % ex)
def mark_good(workflow, files, oracleDB, logger):
    """
    Mark the list of files as published (publication state DONE)
    """
    wfnamemsg = "%s: " % workflow
    for lfn in files:
        data = {}
        source_lfn = lfn
        docId = getHashLfn(source_lfn)
        msg = "Marking file %s as published." % lfn
        msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
        logger.info(wfnamemsg + msg)
        data['asoworker'] = config.General.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = docId
        data['list_of_publication_state'] = 'DONE'
        data['list_of_retry_value'] = 1
        data['list_of_failure_reason'] = ''
        try:
            result = oracleDB.post(config.General.oracleFileTrans,
                                   data=encodeRequest(data))
            logger.debug("updated: %s %s " % (docId, result))
        except Exception as ex:
            logger.error("Error during status update: %s" % ex)
def transferred(self, files):
    """
    Mark the list of files as transferred
    """
    good_ids = list()
    updated_lfn = list()
    try:
        for lfn in files:
            lfn = lfn[0]
            if lfn.find('temp') == 7:
                docId = getHashLfn(lfn)
                good_ids.append(docId)
                updated_lfn.append(lfn)
                self.logger.debug("Marking done %s" % lfn)
                self.logger.debug("Marking done %s" % docId)
        data = dict()
        data['asoworker'] = self.config.asoworker
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = good_ids
        data['list_of_transfer_state'] = ["DONE" for x in good_ids]
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(data))
        self.logger.debug("Marked good %s" % good_ids)
    except Exception:
        self.logger.exception("Error updating documents")
        return 1
    return 0
def mark_failed(files, oracleDB, logger, failure_reason=""): """ Something failed for these files so increment the retry count """ h = 0 for lfn in files: h += 1 logger.debug("Marking failed %s" % h) source_lfn = lfn docId = getHashLfn(source_lfn) logger.debug("Marking failed %s" % docId) try: docbyId = oracleDB.get(config.General.oracleUserTrans, data=encodeRequest({'subresource': 'getById', 'id': docId})) except Exception: logger.exception("Error updating failed docs.") continue document = oracleOutputMapping(docbyId, None)[0] logger.debug("Document: %s" % document) try: fileDoc = dict() fileDoc['asoworker'] = config.General.asoworker fileDoc['subresource'] = 'updatePublication' fileDoc['list_of_ids'] = docId fileDoc['list_of_publication_state'] = 'FAILED' #if force_failure or document['publish_retry_count'] > self.max_retry: # fileDoc['list_of_publication_state'] = 'FAILED' #else: # fileDoc['list_of_publication_state'] = 'RETRY' # TODO: implement retry fileDoc['list_of_retry_value'] = 1 fileDoc['list_of_failure_reason'] = failure_reason logger.debug("fileDoc: %s " % fileDoc) _ = oracleDB.post(config.General.oracleFileTrans, data=encodeRequest(fileDoc)) logger.debug("updated: %s " % docId) except Exception as ex: msg = "Error updating document: %s" % fileDoc msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) continue
def acquire(self):
    """
    Get a number (1k for the current Oracle REST) of documents
    and bind them to this ASO instance
    NEW -> ACQUIRED (asoworker NULL -> config.asoworker)
    :return: the list of users whose transfers were acquired, or 1 on failure
    """
    self.logger.info('Retrieving users...')
    fileDoc = dict()
    fileDoc['subresource'] = 'activeUsers'
    fileDoc['grouping'] = 0
    fileDoc['asoworker'] = self.config.asoworker

    try:
        result = self.oracleDB.get(self.config.oracleFileTrans,
                                   data=encodeRequest(fileDoc))
    except Exception as ex:
        self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
        return 1

    users = list()
    try:
        docs = oracleOutputMapping(result)
        users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
        self.logger.info('Users to process: %s' % str(users))
    except Exception:
        self.logger.exception('User data malformed.')

    for user in users:
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquireTransfers'
        fileDoc['username'] = user[0]

        self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user)
        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)

    return users
def getPublDescFiles(self, workflow, lfn_ready, logger): """ Download and read the files describing what needs to be published """ dataDict = {} dataDict['taskname'] = workflow dataDict['filetype'] = 'EDM' out = [] # divide lfn per chunks, avoiding URI-too long exception def chunks(l, n): """ Yield successive n-sized chunks from l. :param l: list to splitt in chunks :param n: chunk size :return: yield the next list chunk """ for i in range(0, len(l), n): yield l[i:i + n] chunkSize = 10 nIter = 0 if len(lfn_ready) > chunkSize: logger.info( "retrieving input file metadata for %s files in chunks of %s", len(lfn_ready), chunkSize) for lfn_ in chunks(lfn_ready, chunkSize): nIter += 1 dataDict['lfn'] = lfn_ data = encodeRequest(dataDict, listParams=["lfn"]) uri = self.REST_filemetadata try: #res = self.crabServer.get(uri=uri, data=encodeRequest(data, listParams=["lfn"])) res = self.crabServer.get(uri=uri, data=data) res = res[0] except Exception as ex: logger.error("Error during metadata retrieving from %s: %s", uri, ex) continue # print(len(res['result'])) for obj in res['result']: if isinstance(obj, dict): out.append(obj) else: # print type(obj) out.append(json.loads(str(obj))) if nIter % 10 == 0: logger.info("... retrieved %s metadata", len(out)) return out
def getData(subresource):
    """This function will fetch data from the Oracle table"""
    crabserver = CRABRest(hostname=CMSWEB, localcert=CERTIFICATE, localkey=KEY,
                          retry=3, userAgent='CRABTaskWorker')
    crabserver.setDbInstance(dbInstance=DBINSTANCE)
    result = crabserver.get(api='filetransfers',
                            data=encodeRequest({'subresource': subresource, 'grouping': 0}))
    return oracleOutputMapping(result)
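
# Hedged usage sketch (not part of the original module): 'acquiredPublication' is a
# subresource used elsewhere in this code, and the per-row dict keys follow from how
# oracleOutputMapping() results are consumed in the other functions above.
def _example_getData_usage():
    """Fetch acquired publications and count documents per task."""
    docs = getData('acquiredPublication')
    perTask = {}
    for row in docs:
        perTask[row['taskname']] = perTask.get(row['taskname'], 0) + 1
    return perTask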
def lastPubTime(self, workflow):
    """
    :param workflow:
    :return:
    """
    data = dict()
    data['workflow'] = workflow
    data['subresource'] = 'updatepublicationtime'
    try:
        result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                   data=encodeRequest(data))
        self.logger.debug("%s last publication type update: %s " % (workflow, str(result)))
    except Exception:
        msg = "Error updating last publication time"
        self.logger.exception(msg)
def acquirePub(self):
    """
    :return:
    """
    fileDoc = dict()
    fileDoc['asoworker'] = self.config.asoworker
    fileDoc['subresource'] = 'acquirePublication'

    self.logger.debug("Retrieving publications from oracleDB")
    try:
        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(fileDoc))
    except Exception as ex:
        self.logger.error("Failed to acquire publications from oracleDB: %s" % ex)
def searchTask(self, workflow):
    """
    :param workflow:
    :return:
    """
    data = dict()
    data['workflow'] = workflow
    data['subresource'] = 'search'
    try:
        result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                   data=encodeRequest(data))
        self.logger.debug("task: %s " % str(result[0]))
        self.logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication'))
    except Exception as ex:
        self.logger.error("Error during task doc retrieving: %s" % ex)
        return {}
    return oracleOutputMapping(result)
def source_destinations_by_user(self):
    """
    Get all the destinations for a user
    """
    if self.config.isOracle:
        self.logger.debug('Running acquiredTransfers query... ' + self.user)
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquiredTransfers'
        fileDoc['grouping'] = 1
        fileDoc['username'] = self.user
        group = self.group
        role = self.role
        if self.group == '':
            group = None
        if self.role == '':
            role = None
        fileDoc['vogroup'] = group
        fileDoc['vorole'] = role
        fileDoc['limit'] = self.config.max_files_per_transfer
        result = []

        self.logger.debug('Request: ' + str(fileDoc))
        try:
            results = self.oracleDB.get(self.config.oracleFileTrans,
                                        data=encodeRequest(fileDoc))
            result = oracleOutputMapping(results)
            res = [[x['source'], x['destination']] for x in result]
            res.sort()
            res = list(k for k, _ in itertools.groupby(res))
        except Exception as ex:
            self.logger.error("Failed to get acquired transfers from oracleDB: %s" % ex)
            return [], {}
        return res, result
    else:
        query = {'group': True,
                 'startkey': [self.user, self.group, self.role],
                 'endkey': [self.user, self.group, self.role, {}, {}]}
        try:
            sites = self.db.loadView(self.config.ftscp_design, 'ftscp_all', query)
        except Exception:
            return []
        return [[x[4], x[3]] for x in sites['rows']]
def retry(self):
    """
    Retry documents older than self.config.cooloffTime
    :return:
    """
    fileDoc = dict()
    fileDoc['asoworker'] = self.config.asoworker
    fileDoc['subresource'] = 'retryTransfers'
    fileDoc['time_to'] = self.config.cooloffTime
    self.logger.debug('fileDoc: %s' % fileDoc)

    results = dict()
    try:
        results = self.oracleDB.post(self.config.oracleFileTrans,
                                     data=encodeRequest(fileDoc))
    except Exception:
        self.logger.exception("Failed to get retry transfers in oracleDB")
    self.logger.info("Retried files in cooloff: %s" % str(results))

    return 0
def algorithm(self, parameters=None):
    """
    Performs the doRetries method, loading the appropriate
    plugin for each job and handling it.
    """
    logging.debug("Running retryManager algorithm")
    if self.config.isOracle:
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'retryTransfers'
        fileDoc['time_to'] = self.cooloffTime
        self.logger.debug('fileDoc: %s' % fileDoc)
        results = None
        try:
            results = self.oracleDB.post(self.config.oracleFileTrans,
                                         data=encodeRequest(fileDoc))
        except Exception:
            self.logger.exception("Failed to get retry transfers in oracleDB")
        logging.info("Retried files in cooloff: %s" % str(results))
    else:
        self.doRetries()
def submitted(self, files):
    """
    Mark the list of files as submitted once the FTS submission succeeded
    ACQUIRED -> SUBMITTED
    Return the lfns updated successfully and report data for dashboard
    :param files: list of (source_lfn, dest_lfn) tuples
    :return:
    """
    lfn_in_transfer = []
    dash_rep = ()
    id_list = list()
    docId = ''
    for lfn in files:
        lfn = lfn[0]
        if lfn.find('temp') == 7:
            self.logger.debug("Marking acquired %s" % lfn)
            docId = getHashLfn(lfn)
            self.logger.debug("Marking acquired %s" % docId)
            try:
                id_list.append(docId)
                lfn_in_transfer.append(lfn)
            except Exception as ex:
                self.logger.error("Error getting id: %s" % ex)
                raise
            # TODO: add dashboard stuff
            # dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])
    try:
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'updateTransfers'
        fileDoc['list_of_ids'] = id_list
        fileDoc['list_of_transfer_state'] = ["SUBMITTED" for x in id_list]

        self.oracleDB.post(self.config.oracleFileTrans,
                           data=encodeRequest(fileDoc))
        self.logger.debug("Marked acquired %s" % (id_list))
    except Exception as ex:
        self.logger.error("Error during status update: %s" % ex)

    return lfn_in_transfer, dash_rep
def getPublDescFiles(self, workflow, lfn_ready): """ Download and read the files describing what needs to be published """ data = {} data['taskname'] = workflow data['filetype'] = 'EDM' out = [] # divide lfn per chunks, avoiding URI-too long exception def chunks(l, n): """ Yield successive n-sized chunks from l. :param l: list to splitt in chunks :param n: chunk size :return: yield the next list chunk """ for i in range(0, len(l), n): yield l[i:i + n] for lfn_ in chunks(lfn_ready, 50): data['lfn'] = lfn_ try: res = self.oracleDB.get('/crabserver/preprod/filemetadata', data=encodeRequest(data, listParams=["lfn"])) res = res[0] except Exception as ex: self.logger.error("Error during metadata retrieving: %s" %ex) continue print(len(res['result'])) for obj in res['result']: if isinstance(obj, dict): out.append(obj) else: #print type(obj) out.append(json.loads(str(obj))) return out
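
# Illustration of the chunks() helper used above (added comment, not original code):
#   list(chunks(['a', 'b', 'c', 'd', 'e'], 2)) -> [['a', 'b'], ['c', 'd'], ['e']]
# i.e. the LFN list is split into fixed-size slices so that each GET to the
# 'filemetadata' API stays below the URI length limit.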
def getPub(self):
    """
    :return:
    """
    to_pub_docs = list()
    filedoc = dict()
    filedoc['asoworker'] = self.config.asoworker
    filedoc['subresource'] = 'acquiredPublication'
    filedoc['grouping'] = 0
    try:
        results = self.oracleDB.get(self.config.oracleFileTrans,
                                    data=encodeRequest(filedoc))
        to_pub_docs = oracleOutputMapping(results)
    except Exception as ex:
        self.logger.error("Failed to get acquired publications from oracleDB: %s" % ex)
        return to_pub_docs
    return to_pub_docs
def update_FTSJobID(self, jobReport): """ """ for job in jobReport: try: fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'updateTransfers' fileDoc['list_of_ids'] = [getHashLfn(x) for x in job['LFNs']] fileDoc['list_of_transfer_state'] = ["SUBMITTED" for x in job['LFNs']] fileDoc['list_of_fts_instance'] = [self.fts_server_for_transfer for x in job['LFNs']] fileDoc['list_of_fts_id'] = [ job['FTSJobid'] for x in job['LFNs'] ] self.logger.debug("Marking submitted %s files " % (len(fileDoc['list_of_ids']))) result = self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) self.logger.debug("Marked submitted %s" % (fileDoc['list_of_ids'])) except Exception as ex: self.logger.error("Error during status update: %s" %ex) time.sleep(10) return False return True
def mark_transferred(ids, oracleDB): """ Mark the list of files as tranferred :param ids: list of Oracle file ids to update :return: 0 success, 1 failure """ os.environ["X509_CERT_DIR"] = os.getcwd() already_list = [] if os.path.exists("task_process/transfers/transferred_files.txt"): with open("task_process/transfers/transferred_files.txt", "r") as list_file: for _data in list_file.readlines(): already_list.append(_data.split("\n")[0]) ids = [x for x in ids if x not in already_list] if len(ids) > 0: try: logging.debug("Marking done %s", ids) data = dict() data['asoworker'] = 'rucio' data['subresource'] = 'updateTransfers' data['list_of_ids'] = ids data['list_of_transfer_state'] = ["DONE" for _ in ids] oracleDB.post('/filetransfers', data=encodeRequest(data)) logging.info("Marked good %s", ids) with open("task_process/transfers/transferred_files.txt", "a+") as list_file: for id_ in ids: list_file.write("%s\n" % id_) except Exception: logging.exception("Error updating documents") return 1 else: logging.info("Nothing to update (Done)") return 0
def mark_transferred(ids, crabserver):
    """
    Mark the list of files as transferred
    :param ids: list of Oracle file ids to update
    :param crabserver: a CRABRest object for doing POST to CRAB server REST
    :return: True success, False failure
    """
    try:
        logging.debug("Marking done %s", ids)
        data = dict()
        data['asoworker'] = asoworker
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = ids
        data['list_of_transfer_state'] = ["DONE" for _ in ids]
        crabserver.post('/filetransfers', data=encodeRequest(data))
        logging.info("Marked good %s", ids)
    except Exception:
        logging.exception("Error updating documents")
        return False
    return True
def mark_failed(ids, failures_reasons, crabserver):
    """
    Mark the list of files as failed
    :param ids: list of Oracle file ids to update
    :param failures_reasons: list of strings with transfer failure messages
    :param crabserver: a CRABRest object for doing POST to CRAB server REST
    :return: True success, False failure
    """
    try:
        data = dict()
        data['asoworker'] = asoworker
        data['subresource'] = 'updateTransfers'
        data['list_of_ids'] = ids
        data['list_of_transfer_state'] = ["FAILED" for _ in ids]
        data['list_of_failure_reason'] = failures_reasons
        data['list_of_retry_value'] = [0 for _ in ids]
        crabserver.post('/filetransfers', data=encodeRequest(data))
        logging.info("Marked failed %s", ids)
    except Exception:
        logging.exception("Error updating documents")
        return False
    return True
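
# Hedged usage sketch (not part of the original module): shows how a monitoring
# loop could route finished transfers to the mark_transferred()/mark_failed()
# helpers above. The ids and reasons below are hypothetical placeholders.
def _example_update_statuses(crabserver):
    """Mark one batch of ids as DONE and another as FAILED with per-id reasons."""
    doneIds = ['1a2b3c4d']                                              # hypothetical Oracle file ids
    failedIds = ['5e6f7a8b']
    reasons = ['Destination file exists and overwrite is disabled']     # one message per failed id
    okDone = mark_transferred(doneIds, crabserver)
    okFailed = mark_failed(failedIds, reasons, crabserver)
    return okDone and okFailed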
def publishInDBS3(taskname): """ """ def createLogdir(dirname): """ Create the directory dirname ignoring erors in case it exists. Exit if the directory cannot be created. """ try: os.mkdir(dirname) except OSError as ose: if ose.errno != 17: #ignore the "Directory already exists error" print(str(ose)) print("The task worker need to access the '%s' directory" % dirname) sys.exit(1) createLogdir('taskLogs') logger = logging.getLogger(taskname) logging.basicConfig(filename='taskLogs/'+taskname+'.log', level=logging.INFO, format=config.General.logMsgFormat) logger.info("Getting files to publish") toPublish = [] # TODO move from new to done when processed with open("/tmp/"+taskname+".json") as f: toPublish = json.load(f) workflow = taskname if len(toPublish) == 0: return "EMPTY" if not workflow: logger.info("NO TASKNAME: %s" % toPublish[0]) for k, v in toPublish[0].iteritems(): if k == 'taskname': logger.info("Starting: %s: %s" % (k, v)) wfnamemsg = "%s: " % (workflow) user = toPublish[0]["User"] try: group = toPublish[0]["Group"] role = toPublish[0]["Role"] except: group = "" role = "" if not group or group in ['null']: group = "" if not role or role in ['null']: role = "" userDN = toPublish[0]["UserDN"] pnn = toPublish[0]["Destination"] logger.info(wfnamemsg+" "+user) READ_PATH = "/DBSReader" READ_PATH_1 = "/DBSReader/" # TODO: get user role and group try: proxy = Proxy(userDN, group, role, logger) except: logger.exception("Failed to retrieve user proxy") return "FAILED" oracelInstance = config.General.oracleDB oracleDB = HTTPRequests(oracelInstance, proxy, proxy) fileDoc = dict() fileDoc['subresource'] = 'search' fileDoc['workflow'] = workflow try: results = oracleDB.get(task_path, data=encodeRequest(fileDoc)) except Exception as ex: logger.error("Failed to get acquired publications from oracleDB for %s: %s" % (workflow, ex)) return "FAILED" logger.info(results[0]['desc']['columns']) try: inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset") inputDataset = results[0]['result'][inputDatasetIndex] sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url") sourceURL = results[0]['result'][sourceURLIndex] publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url") publish_dbs_url = results[0]['result'][publish_dbs_urlIndex] #sourceURL = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader" if not sourceURL.endswith(READ_PATH) and not sourceURL.endswith(READ_PATH_1): sourceURL += READ_PATH except Exception: logger.exception("ERROR") ## When looking up parents may need to look in global DBS as well. 
globalURL = sourceURL globalURL = globalURL.replace('phys01', 'global') globalURL = globalURL.replace('phys02', 'global') globalURL = globalURL.replace('phys03', 'global') globalURL = globalURL.replace('caf', 'global') pr = os.environ.get("SOCKS5_PROXY") logger.info(wfnamemsg+"Source API URL: %s" % sourceURL) sourceApi = dbsClient.DbsApi(url=sourceURL, proxy=pr) logger.info(wfnamemsg+"Global API URL: %s" % globalURL) globalApi = dbsClient.DbsApi(url=globalURL, proxy=pr) WRITE_PATH = "/DBSWriter" MIGRATE_PATH = "/DBSMigrate" READ_PATH = "/DBSReader" if publish_dbs_url.endswith(WRITE_PATH): publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH else: publish_migrate_url = publish_dbs_url + MIGRATE_PATH publish_read_url = publish_dbs_url + READ_PATH publish_dbs_url += WRITE_PATH try: logger.debug(wfnamemsg+"Destination API URL: %s" % publish_dbs_url) destApi = dbsClient.DbsApi(url=publish_dbs_url, proxy=pr) logger.debug(wfnamemsg+"Destination read API URL: %s" % publish_read_url) destReadApi = dbsClient.DbsApi(url=publish_read_url, proxy=pr) logger.debug(wfnamemsg+"Migration API URL: %s" % publish_migrate_url) migrateApi = dbsClient.DbsApi(url=publish_migrate_url, proxy=pr) except: logger.exception('Wrong DBS URL %s' % publish_dbs_url) return "FAILED" logger.info("inputDataset: %s" % inputDataset) noInput = len(inputDataset.split("/")) <= 3 # TODO: fix dbs dep if not noInput: try: existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*') primary_ds_type = existing_datasets[0]['primary_ds_type'] # There's little chance this is correct, but it's our best guess for now. # CRAB2 uses 'crab2_tag' for all cases existing_output = destReadApi.listOutputConfigs(dataset=inputDataset) except: logger.exception('Wrong DBS URL %s' % publish_dbs_url) return "FAILED" if not existing_output: msg = "Unable to list output config for input dataset %s." % (inputDataset) logger.error(wfnamemsg+msg) global_tag = 'crab3_tag' else: global_tag = existing_output[0]['global_tag'] else: msg = "This publication appears to be for private MC." logger.info(wfnamemsg+msg) primary_ds_type = 'mc' global_tag = 'crab3_tag' acquisition_era_name = "CRAB" processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'} appName = 'cmsRun' appVer = toPublish[0]["swversion"] pset_hash = toPublish[0]['publishname'].split("-")[-1] gtag = str(toPublish[0]['globaltag']) if gtag == "None": gtag = global_tag try: if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]: acquisitionera = str(toPublish[0]['acquisitionera']) else: acquisitionera = acquisition_era_name except: acquisitionera = acquisition_era_name _, primName, procName, tier = toPublish[0]['outdataset'].split('/') primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type} msg = "About to insert primary dataset: %s" % (str(primds_config)) logger.debug(wfnamemsg+msg) destApi.insertPrimaryDataset(primds_config) msg = "Successfully inserted primary dataset %s." % (primName) logger.debug(wfnamemsg+msg) final = {} failed = [] publish_in_next_iteration = [] published = [] dataset = toPublish[0]['outdataset'] # Find all (valid) files already published in this dataset. 
try: existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True) existingFiles = [f['logical_file_name'] for f in existingDBSFiles] existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']] msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles)) msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid)) logger.info(wfnamemsg+msg) final['existingFiles'] = len(existingFiles) except Exception as ex: msg = "Error when listing files in DBS: %s" % (str(ex)) msg += "\n%s" % (str(traceback.format_exc())) logger.error(wfnamemsg+msg) return "FAILED" # check if actions are needed workToDo = False for fileTo in toPublish: if fileTo['lfn'] not in existingFilesValid: workToDo = True if not workToDo: msg = "Nothing uploaded, %s has these files already or not enough files." % (dataset) logger.info(wfnamemsg+msg) return "NOTHING TO DO" acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0} output_config = {'release_version': appVer, 'pset_hash': pset_hash, 'app_name': appName, 'output_module_label': 'o', 'global_tag': global_tag, } msg = "Published output config." logger.debug(wfnamemsg+msg) dataset_config = {'dataset': dataset, 'processed_ds_name': procName, 'data_tier_name': tier, 'acquisition_era_name': acquisitionera, 'dataset_access_type': 'VALID', 'physics_group_name': 'CRAB3', 'last_modification_date': int(time.time()), } msg = "About to insert dataset: %s" % (str(dataset_config)) logger.info(wfnamemsg+msg) del dataset_config['acquisition_era_name'] # List of all files that must (and can) be published. dbsFiles = [] dbsFiles_f = [] # Set of all the parent files from all the files requested to be published. parentFiles = set() # Set of parent files for which the migration to the destination DBS instance # should be skipped (because they were not found in DBS). parentsToSkip = set() # Set of parent files to migrate from the source DBS instance # to the destination DBS instance. localParentBlocks = set() # Set of parent files to migrate from the global DBS instance # to the destination DBS instance. globalParentBlocks = set() # Loop over all files to publish. for file_ in toPublish: logger.info(file_) # Check if this file was already published and if it is valid. if file_['lfn'] not in existingFilesValid: # We have a file to publish. # Get the parent files and for each parent file do the following: # 1) Add it to the list of parent files. # 2) Find the block to which it belongs and insert that block name in # (one of) the set of blocks to be migrated to the destination DBS. for parentFile in list(file_['parents']): if parentFile not in parentFiles: parentFiles.add(parentFile) # Is this parent file already in the destination DBS instance? # (If yes, then we don't have to migrate this block.) blocksDict = destReadApi.listBlocks(logical_file_name=parentFile) if not blocksDict: # No, this parent file is not in the destination DBS instance. # Maybe it is in the same DBS instance as the input dataset? blocksDict = sourceApi.listBlocks(logical_file_name=parentFile) if blocksDict: # Yes, this parent file is in the same DBS instance as the input dataset. # Add the corresponding block to the set of blocks from the source DBS # instance that have to be migrated to the destination DBS. localParentBlocks.add(blocksDict[0]['block_name']) else: # No, this parent file is not in the same DBS instance as input dataset. # Maybe it is in global DBS instance? 
blocksDict = globalApi.listBlocks(logical_file_name=parentFile) if blocksDict: # Yes, this parent file is in global DBS instance. # Add the corresponding block to the set of blocks from global DBS # instance that have to be migrated to the destination DBS. globalParentBlocks.add(blocksDict[0]['block_name']) # If this parent file is not in the destination DBS instance, is not # the source DBS instance, and is not in global DBS instance, then it # means it is not known to DBS and therefore we can not migrate it. # Put it in the set of parent files for which migration should be skipped. if not blocksDict: parentsToSkip.add(parentFile) # If this parent file should not be migrated because it is not known to DBS, # we remove it from the list of parents in the file-to-publish info dictionary # (so that when publishing, this "parent" file will not appear as a parent). if parentFile in parentsToSkip: msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile) logger.info(wfnamemsg+msg) if parentFile in file_['parents']: file_['parents'].remove(parentFile) # Add this file to the list of files to be published. dbsFiles.append(format_file_3(file_)) dbsFiles_f.append(file_) #print file published.append(file_['SourceLFN']) # Print a message with the number of files to publish. msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles)) logger.info(wfnamemsg+msg) # If there are no files to publish, continue with the next dataset. if len(dbsFiles_f) == 0: msg = "Nothing to do for this dataset." logger.info(wfnamemsg+msg) return "NOTHING TO DO" # Migrate parent blocks before publishing. # First migrate the parent blocks that are in the same DBS instance # as the input dataset. if localParentBlocks: msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks) logger.info(wfnamemsg+msg) statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, sourceApi, inputDataset, localParentBlocks ) if statusCode: failureMsg += " Not publishing any files." logger.info(wfnamemsg+failureMsg) failed.extend([f['SourceLFN'] for f in dbsFiles_f]) failure_reason = failureMsg published = [x for x in published[dataset] if x not in failed[dataset]] return "NOTHING TO DO" # Then migrate the parent blocks that are in the global DBS instance. if globalParentBlocks: msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks) logger.info(wfnamemsg+msg) statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, globalApi, inputDataset, globalParentBlocks) if statusCode: failureMsg += " Not publishing any files." logger.info(wfnamemsg+failureMsg) failed.extend([f['SourceLFN'] for f in dbsFiles_f]) failure_reason = failureMsg published = [x for x in published[dataset] if x not in failed[dataset]] return "NOTHING TO DO" # Publish the files in blocks. The blocks must have exactly max_files_per_block # files, unless there are less than max_files_per_block files to publish to # begin with. If there are more than max_files_per_block files to publish, # publish as many blocks as possible and leave the tail of files for the next # PublisherWorker call, unless forced to published. 
block_count = 0 count = 0 max_files_per_block = config.General.max_files_per_block while True: block_name = "%s#%s" % (dataset, str(uuid.uuid4())) files_to_publish = dbsFiles[count:count+max_files_per_block] try: block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0} msg = "Inserting files %s into block %s." % ([f['logical_file_name'] for f in files_to_publish], block_name) logger.info(wfnamemsg+msg) blockDump = createBulkBlock(output_config, processing_era_config, primds_config, dataset_config, acquisition_era_config, block_config, files_to_publish) #logger.debug(wfnamemsg+"Block to insert: %s\n %s" % (blockDump, destApi.__dict__ )) destApi.insertBulkBlock(blockDump) block_count += 1 except Exception as ex: logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish]) failed.extend([f['SourceLFN'] for f in toPublish]) msg = "Error when publishing (%s) " % ", ".join(failed) msg += str(ex) msg += str(traceback.format_exc()) logger.error(wfnamemsg+msg) failure_reason = str(ex) count += max_files_per_block files_to_publish_next = dbsFiles_f[count:count+max_files_per_block] if len(files_to_publish_next) < max_files_per_block: publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next]) break published = [x for x in published if x not in failed + publish_in_next_iteration] # Fill number of files/blocks published for this dataset. final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration) final['blocks'] = block_count # Print a publication status summary for this dataset. msg = "End of publication status for dataset %s:" % (dataset) msg += " failed (%s) %s" % (len(failed), failed) msg += ", published (%s) %s" % (len(published), published) msg += ", publish_in_next_iteration (%s) %s" % (len(publish_in_next_iteration), publish_in_next_iteration) msg += ", results %s" % (final) logger.info(wfnamemsg+msg) try: if published: mark_good(workflow, published, oracleDB, logger) if failed: logger.debug("Failed files: %s " % failed) mark_failed(failed, oracleDB, logger, failure_reason) except: logger.exception("Status update failed") return 0
def testFileTransferPUT(self): """ _testFileTransferPUT_ Just test simple testFileTransferPUT with fake data """ # We just sent fake data which is not monitored by dashboard. # Also only the first time to decide is publication ON or NOT for user in self.users: timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime()) for i in range(self.totalFiles): now = int(time.time()) # Generate a taskname workflowName = "" taskname = "" if user not in self.tasks: workflowName = "".join([ random.choice(string.ascii_lowercase) for _ in range(20) ]) + "_" + str(now) publicationState = random.choice(['NEW', 'NOT_REQUIRED']) else: workflowName = self.tasks[user]['workflowName'] publicationState = self.tasks[user]['publication'] transferState = random.choice(['NEW', 'DONE']) taskname = generateTaskName(user, workflowName, timestamp) finalLfn = self.lfnBase % (user, workflowName, i, random.randint(1, 9999)) idHash = getHashLfn(finalLfn) self.fileDoc['id'] = idHash self.fileDoc['job_id'] = i self.fileDoc['username'] = user self.fileDoc['taskname'] = taskname self.fileDoc['start_time'] = int(time.time()) self.fileDoc['source_lfn'] = finalLfn self.fileDoc['destination_lfn'] = finalLfn self.fileDoc['transfer_state'] = transferState self.fileDoc['publication_state'] = publicationState print(self.fileDoc) self.server.put('/crabserver/dev/fileusertransfers', data=encodeRequest(self.fileDoc)) # if I will put the same doc twice, it should raise an error. # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc)) # This tasks are for the future and next calls if user not in self.tasks: self.tasks[user] = { 'workflowName': workflowName, 'taskname': taskname, 'listOfIds': [], 'publication': publicationState, 'toTransfer': 0, 'toPublish': 0, 'total': self.totalFiles } if self.tasks[user]['publication'] == 'NEW': self.tasks[user]['toPublish'] += 1 if transferState == 'NEW': self.tasks[user]['toTransfer'] += 1 self.tasks[user]['listOfIds'].append(idHash) # This should raise an error for username in self.tasks: taskname = self.tasks[username]['taskname'] for query in ['getTransferStatus', 'getPublicationStatus']: result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({ 'subresource': query, 'username': username, 'taskname': taskname })) print(result) print(result[0]['result']) taskInfoDict = oracleOutputMapping(result, 'id') print(taskInfoDict) for key, docDict in taskInfoDict.items(): result = self.server.get( '/crabserver/dev/fileusertransfers', data=encodeRequest({ 'subresource': 'getById', 'id': key })) randomUsers = random.sample( set(self.users), 3 ) # Take half of the users and kill their transfers for specific task for username in randomUsers: taskname = self.tasks[username]['taskname'] result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({ 'subresource': 'killTransfers', 'username': username, 'taskname': taskname })) print(result) # oneUser is left for killing a list of IDs # leftUsers will be killing transfers one by one for specific id. 
leftUsers = list(set(self.users) - set(randomUsers)) oneUser = random.sample(set(leftUsers), 1) leftUsers = list(set(leftUsers) - set(oneUser)) for username in leftUsers: # First get all left ids for this users result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({ 'subresource': 'getTransferStatus', 'username': username, 'taskname': self.tasks[username]['taskname'] })) resultOut = oracleOutputMapping(result, None) print("**" * 50) for outDict in resultOut: print(outDict) result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({ 'subresource': 'killTransfersById', 'username': username, 'listOfIds': outDict['id'] })) print(result) print(resultOut) print(result) for username in oneUser: result = self.server.post( '/crabserver/dev/fileusertransfers', data=encodeRequest( { 'subresource': 'killTransfersById', 'username': username, 'listOfIds': self.tasks[username]['listOfIds'] }, ['listOfIds'])) # As it asks to kill all which are in new, need to double check what we submitted before and if the output of killed is correct print(result) print(self.tasks[username])
def submit(phedex, ftsContext, toTrans): """ submit tranfer jobs - group files to be transferred by source site - prepare jobs chunks of max 200 transfers - submit fts job :param ftsContext: fts client ftsContext :param toTrans: [source pfn, destination pfn, oracle file id, source site] :return: list of jobids submitted """ threadLock = threading.Lock() threads = [] jobids = [] to_update = [] oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy) sources = list(set([x[3] for x in toTrans])) for source in sources: ids = [x[2] for x in toTrans if x[3] == source] username = toTrans[0][5] taskname = toTrans[0][6] src_lfns = [x[0] for x in toTrans if x[3] == source] dst_lfns = [x[1] for x in toTrans if x[3] == source] sorted_source_pfns = [] sorted_dest_pfns = [] try: for chunk in chunks(src_lfns, 10): unsorted_source_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(source, chunk).items()] for order_lfn in chunk: for lfn, pfn in unsorted_source_pfns: if order_lfn == lfn: sorted_source_pfns.append(pfn) break for chunk in chunks(dst_lfns, 10): unsorted_dest_pfns = [[k[1], str(x)] for k, x in phedex.getPFN(toTrans[0][4], chunk).items()] for order_lfn in chunk: for lfn, pfn in unsorted_dest_pfns: if order_lfn == lfn: sorted_dest_pfns.append(pfn) break except Exception as ex: logging.error("Failed to map lfns to pfns: %s", ex) mark_failed(ids, ["Failed to map lfn to pfn: " + str(ex) for _ in ids]) source_pfns = sorted_source_pfns dest_pfns = sorted_dest_pfns tx_from_source = [[x[0], x[1], x[2], source, username, taskname] for x in zip(source_pfns, dest_pfns, ids)] for files in chunks(tx_from_source, 200): thread = submit_thread(threadLock, logging, ftsContext, files, source, jobids, to_update) thread.start() threads.append(thread) for t in threads: t.join() for fileDoc in to_update: _ = oracleDB.post('/filetransfers', data=encodeRequest(fileDoc)) logging.info("Marked submitted %s files", fileDoc['list_of_ids']) return jobids
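
# Note added for clarity (layout inferred from the index accesses above, not from
# the original documentation): each toTrans row appears to carry more fields than
# the docstring lists, roughly
#   [0] source lfn, [1] destination lfn, [2] oracle file id,
#   [3] source site, [4] destination site, [5] username, [6] taskname
# since the code reads toTrans[0][4] for the destination PFN lookup and
# toTrans[0][5] / toTrans[0][6] as username / taskname.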
from __future__ import division

from RESTInteractions import HTTPRequests
from ServerUtilities import encodeRequest, oracleOutputMapping

server = HTTPRequests('cmsweb-testbed.cern.ch',
                      '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy',
                      '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy')

fileDoc = {}
fileDoc['asoworker'] = 'asodciangot1'
fileDoc['subresource'] = 'acquireTransfers'

result = server.post('/crabserver/dev/filetransfers',
                     data=encodeRequest(fileDoc))

print(result)

"""
fileDoc = {}
fileDoc['asoworker'] = 'asodciangot1'
fileDoc['subresource'] = 'acquiredTransfers'
fileDoc['grouping'] = 0

result = server.get('/crabserver/dev/filetransfers',
                    data=encodeRequest(fileDoc))

#print(oracleOutputMapping(result))
ids = [str(x['id']) for x in oracleOutputMapping(result)]
"""
def oracleSiteUser(self, db):
    """
    1. Acquire transfers from DB
    2. Get acquired users and destination sites
    """
    self.logger.info('Retrieving users...')
    fileDoc = dict()
    fileDoc['subresource'] = 'activeUsers'
    fileDoc['grouping'] = 0
    fileDoc['asoworker'] = self.config.asoworker

    result = dict()
    try:
        result = db.get(self.config.oracleFileTrans,
                        data=encodeRequest(fileDoc))
    except Exception as ex:
        self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
        return []

    self.logger.debug(oracleOutputMapping(result))
    # TODO: translate result into list((user,group,role),...)
    users = list()
    if len(oracleOutputMapping(result)) != 0:
        self.logger.debug(type([[x['username'].encode('ascii', 'ignore'), x['user_group'], x['user_role']]
                                for x in oracleOutputMapping(result)]))
        try:
            docs = oracleOutputMapping(result)
            users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
            self.logger.info('Users to process: %s' % str(users))
        except Exception:
            self.logger.exception('User data malformed.')
    else:
        self.logger.info('No new user to acquire')
        return []

    actives = list()
    for user in users:
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquireTransfers'
        fileDoc['username'] = user[0]

        self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user[0])
        try:
            result = db.post(self.config.oracleFileTrans,
                             data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers from oracleDB: %s" % ex)
            continue
        self.doc_acq = str(result)

        for i in range(len(user)):
            if not user[i] or user[i] in ['None', 'NULL']:
                user[i] = ''
            user[i] = str(user[i])
        actives.append(user)

        self.logger.debug("Transfers retrieved from oracleDB. %s " % users)

    return users
def monitor(user, taskname, log): """ function monitoring the Rucio replica locks of a rule and updating db statuses accordingly :param user: user HN name :type user: str :param taskname: CRAB taskname :type taskname: str :param log: log object :type log: logging """ os.environ["X509_CERT_DIR"] = os.getcwd() proxy = None if os.path.exists('task_process/rest_filetransfers.txt'): with open("task_process/rest_filetransfers.txt", "r") as _rest: rest_filetransfers = _rest.readline().split('\n')[0] proxy = os.getcwd() + "/" + _rest.readline() log.info("Proxy: %s", proxy) os.environ["X509_USER_PROXY"] = proxy if not proxy: log.info('No proxy available yet - waiting for first post-job') return None # Prepare user and task info for monitoring scope = "user." + user name = taskname log.info("Initializing Monitor Rucio client for %s", taskname) crabInj = CRABDataInjector("", "", scope=scope, account=user, auth_type='x509_proxy') id_map = {} lfn_map = {} source_rse = {} # create maps for lfn --> oracle id, source rse if os.path.exists('task_process/transfers.txt'): with open('task_process/transfers.txt', 'r') as _list: for _data in _list.readlines(): try: doc = json.loads(_data) id_map.update({doc['destination_lfn']: doc['id']}) lfn_map.update({doc['id']: doc['destination_lfn']}) source_rse.update( {doc['destination_lfn']: doc['source'] + "_Temp"}) except Exception: continue if os.path.exists('task_process/transfers_direct.txt'): with open('task_process/transfers_direct.txt', 'r') as _list: for _data in _list.readlines(): try: doc = json.loads(_data) id_map.update({doc['destination_lfn']: doc['id']}) lfn_map.update({doc['id']: doc['destination_lfn']}) except Exception: continue # get the rule for this rucio dataset try: rules_ = crabInj.cli.list_did_rules(scope, name) # {u'name': u'/store/user/dciangot/DStarToD0Pi_D0KPi_DStarFilter_TuneCP5_13TeV-pythia8-evtgen/crab_DStar_rucio_rucio_198_7/190129_085050/0000/DS2b_17_1.root', u'rse': u'T2_IT_Pisa', u'state': u'OK', u'scope': u'user.dciangot', u'rse_id': u'200b6830ca424d87a2e0ae855341b084', u'rule_id': u'4bc56a77ac6743e791dfedaa11db1e1c'} list_good = [] list_failed = [] list_failed_tmp = [] list_stuck = [] list_update = [] rules = next(rules_) log.debug("RULES %s", rules) except Exception: log.exception("Failed to retrieve rule information") return locks_generator = None # get replica locks and monitor status try: locks_generator = crabInj.cli.list_replica_locks(rules['id']) except Exception: if rules['state'] == 'STUCK': transfers = crabInj.cli.examine_replication_rule( rules['id'])['transfers'] for lfn in transfers: list_stuck.append((lfn['name'], 'Rule STUCK.')) else: log.exception('Unable to get replica locks') return # analyze replica locks info for each file sitename = None # TODO: should we split in threads ? 
for file_ in locks_generator: log.debug("LOCK %s", file_) filename = file_['name'] status = file_['state'] log.info("state %s", status) sitename = file_['rse'] if status == "OK": list_good.append(filename) if status == "STUCK": list_failed_tmp.append((filename, "Transfer Stuck", sitename)) if status == "REPLICATING": try: ftsJobID = crabInj.cli.list_request_by_did( filename, sitename, scope)["external_id"] if ftsJobID: list_update.append((filename, ftsJobID)) except Exception: log.exception("Replica lock not found") # Expose FTS job ID in case of failure (if available) for name_ in [x[0] for x in list_failed_tmp]: try: ftsJobID = crabInj.cli.list_request_by_did(name_, sitename, scope)["external_id"] if ftsJobID: list_failed.append((name_, "FTS job ID: %s" % ftsJobID)) else: list_failed.append(( name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job." )) except Exception: log.error( "No FTS job ID available for stuck transfer %s. Rucio could have failed to submit FTS job." % name_) list_failed.append(( name_, "No FTS job ID available for stuck transfers. Rucio could have failed to submit FTS job." )) # Filter out files already staged directly from the wn direct_files = [] if os.path.exists('task_process/transfers/registered_direct_files.txt'): with open("task_process/transfers/registered_direct_files.txt", "r") as list_file: direct_files = [x.split('\n')[0] for x in list_file.readlines()] log.debug( "Checking if some failed files were directly staged from wn: {0}" .format(str(direct_files))) list_failed = [x for x in list_failed if x[0] not in direct_files] log.debug("{0} files to be marked as failed.".format( str(len(list_failed)))) try: oracleDB = HTTPRequests(rest_filetransfers, proxy, proxy) except Exception: log.exception("Failed to set connection to oracleDB") return # Mark FAILED files on the DB and remove them from dataset and rucio replicas try: if len(list_failed) > 0: list_failed_name = [{ 'scope': scope, 'name': x[0] } for x in list_failed] log.debug("Detaching %s" % list_failed_name) crabInj.cli.detach_dids(scope, name, list_failed_name) sources = list( set([source_rse[x['name']] for x in list_failed_name])) for source in sources: to_delete = [ x for x in list_failed_name if source_rse[x['name']] == source ] log.debug("Deleting %s from %s" % (to_delete, source)) crabInj.delete_replicas(source, to_delete) mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], oracleDB) except ReplicaNotFound: try: mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], oracleDB) except Exception: log.exception("Failed to update status for failed files") except Exception: log.exception("Failed to update status for failed files") # Mark files of STUCK rules on the DB and remove them from dataset and rucio replicas try: if len(list_stuck) > 0: list_stuck_name = [{ 'scope': scope, 'name': x[0] } for x in list_stuck] log.debug("Detaching %s" % list_stuck_name) crabInj.cli.detach_dids(scope, name, list_stuck_name) sources = list( set([source_rse[x['name']] for x in list_stuck_name])) for source in sources: to_delete = [ x for x in list_stuck_name if source_rse[x['name']] == source ] log.debug("Deleting %s from %s" % (to_delete, source)) crabInj.delete_replicas(source, to_delete) mark_failed([id_map[x[0]] for x in list_stuck], [x[1] for x in list_stuck], oracleDB) except ReplicaNotFound: try: mark_failed([id_map[x[0]] for x in list_failed], [x[1] for x in list_failed], oracleDB) except Exception: log.exception("Failed 
to update status for failed files") except Exception: log.exception("Failed to update status for stuck rule") # Mark successful transfers as done on oracle DB try: mark_transferred([id_map[x] for x in list_good], oracleDB) except Exception: log.exception("Failed to update status for transferred files") try: already_list = [] list_update_filt = [] # Keep track of what has been already marked. Avoiding double updates at next iteration if os.path.exists("task_process/transfers/submitted_files.txt"): with open("task_process/transfers/submitted_files.txt", "r") as list_file: for _data in list_file.readlines(): already_list.append(_data.split("\n")[0]) list_update_filt = [ x for x in list_update if x not in already_list and x[0] not in direct_files ] # Insert FTS job ID in oracle DB for all the available tranfers if len(list_update_filt) > 0: list_update = list_update_filt fileDoc = dict() fileDoc['asoworker'] = 'rucio' fileDoc['subresource'] = 'updateTransfers' fileDoc['list_of_ids'] = [id_map[x[0]] for x in list_update] fileDoc['list_of_transfer_state'] = [ "SUBMITTED" for _ in list_update ] fileDoc['list_of_fts_instance'] = [ 'https://fts3.cern.ch:8446/' for _ in list_update ] fileDoc['list_of_fts_id'] = [x[1] for x in list_update] oracleDB.post('/filetransfers', data=encodeRequest(fileDoc)) log.debug("Marked submitted %s" % [id_map[x[0]] for x in list_update]) with open("task_process/transfers/submitted_files.txt", "a+") as list_file: for update in list_update: log.debug("{0}\n".format(str(update))) list_file.write("{0}\n".format(str(update))) else: log.info("Nothing to update (fts job ID)") except Exception: log.exception('Failed to update file status for FTSJobID inclusion.')
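# Illustrative sketch (not part of the original module): the per-lock handling in
# monitor() boils down to a state -> action mapping. The function below only
# mirrors the logic already present above; its name and the returned labels are
# assumptions for illustration.
def classify_lock(state):
    """Map a Rucio replica-lock state to the bookkeeping list used in monitor()."""
    if state == "OK":
        return "good"              # collected in list_good, later marked transferred
    if state == "STUCK":
        return "failed_candidate"  # collected in list_failed_tmp, FTS job ID looked up afterwards
    if state == "REPLICATING":
        return "update"            # FTS job ID recorded in list_update
    return "ignore"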
def mark_failed(self, files=[], failures_reasons=[], force_fail=False): """ Something failed for these files so increment the retry count """ updated_lfn = [] for lfn in files: data = {} self.logger.debug("Document: %s" % lfn) if not isinstance(lfn, dict): if 'temp' not in lfn: temp_lfn = lfn.replace('store', 'store/temp', 1) else: temp_lfn = lfn else: if 'temp' not in lfn['value']: temp_lfn = lfn['value'].replace('store', 'store/temp', 1) else: temp_lfn = lfn['value'] docId = getHashLfn(temp_lfn) # Load document to get the retry_count if self.config.isOracle: try: self.logger.debug("Document: %s" %docId) docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'), data=encodeRequest({'subresource': 'getById', 'id': docId})) document = oracleOutputMapping(docbyId)[0] data = dict() data['asoworker'] = self.config.asoworker data['subresource'] = 'updateTransfers' data['list_of_ids'] = docId if force_fail or document['transfer_retry_count'] + 1 > self.max_retry: data['list_of_transfer_state'] = 'FAILED' data['list_of_retry_value'] = 0 else: data['list_of_transfer_state'] = 'RETRY' fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)]) if fatal_error: data['list_of_transfer_state'] = 'FAILED' data['list_of_failure_reason'] = failures_reasons[files.index(lfn)] data['list_of_retry_value'] = 0 self.logger.debug("update: %s" % data) result = self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(data)) if not data['list_of_transfer_state'] == 'RETRY': updated_lfn.append(lfn) self.logger.debug("Marked failed %s" % lfn) except Exception as ex: self.logger.error("Error updating document status: %s" %ex) continue else: try: document = self.db.document( docId ) except Exception as ex: msg = "Error loading document from couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed': now = str(datetime.datetime.now()) last_update = time.time() # Prepare data to update the document in couch if force_fail or len(document['retry_count']) + 1 > self.max_retry: data['state'] = 'failed' data['end_time'] = now else: data['state'] = 'retry' fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)]) if fatal_error: data['state'] = 'failed' data['end_time'] = now self.logger.debug("Failure list: %s" % failures_reasons) self.logger.debug("Files: %s" % files) self.logger.debug("LFN %s" % lfn) data['failure_reason'] = failures_reasons[files.index(lfn)] data['last_update'] = last_update data['retry'] = now # Update the document in couch self.logger.debug("Marking failed %s" % docId) try: updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId updateUri += "?" + urllib.urlencode(data) self.db.makeRequest(uri = updateUri, type = "PUT", decode = False) updated_lfn.append(docId) self.logger.debug("Marked failed %s" % docId) except Exception as ex: msg = "Error in updating document in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue try: self.db.commit() except Exception as ex: msg = "Error commiting documents in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue else: updated_lfn.append(docId) self.logger.debug("failed file updated") return updated_lfn
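# Illustrative sketch (not part of the original module): the retry/fail decision
# used above, extracted as a pure function. 'retry_count', 'max_retry',
# 'force_fail' and 'fatal_error' carry the meaning they have in mark_failed;
# the helper itself is an assumption added for clarity.
def next_transfer_state(retry_count, max_retry, force_fail=False, fatal_error=False):
    """Return 'FAILED' or 'RETRY' following the same rules applied in mark_failed."""
    if force_fail or fatal_error or retry_count + 1 > max_retry:
        return 'FAILED'
    return 'RETRY'

# e.g. next_transfer_state(2, 3) -> 'RETRY', next_transfer_state(3, 3) -> 'FAILED'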
def active_users(self, db):
    """
    Query a view for users with files to transfer. Get this from the
    following view:
        publish?group=true&group_level=1
    """
    if self.config.isOracle:
        active_users = []

        fileDoc = {}
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquirePublication'

        self.logger.debug("Retrieving publications from oracleDB")

        results = ''
        try:
            results = db.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire publications from oracleDB: %s" % ex)

        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquiredPublication'
        fileDoc['grouping'] = 0

        self.logger.debug("Retrieving acquired publications from oracleDB")

        try:
            results = db.get(self.config.oracleFileTrans, data=encodeRequest(fileDoc))
            result = oracleOutputMapping(results)
        except Exception as ex:
            self.logger.error("Failed to acquire publications from oracleDB: %s" % ex)
            return []

        self.logger.debug("%s acquired publications retrieved" % len(result))
        # TODO: join query for publisher (same as submitter)

        unique_users = [list(i) for i in set(tuple([x['username'], x['user_group'], x['user_role']])
                                             for x in result if x['transfer_state'] == 3)]
        return unique_users
    else:
        # TODO: Remove stale=ok for now until tested
        # query = {'group': True, 'group_level': 3, 'stale': 'ok'}
        query = {'group': True, 'group_level': 3}
        try:
            users = db.loadView('DBSPublisher', 'publish', query)
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)
            return []

        if len(users['rows']) <= self.config.publication_pool_size:
            active_users = users['rows']
            active_users = [x['key'] for x in active_users]
        else:
            pool_size = self.config.publication_pool_size
            sorted_users = self.factory.loadObject(self.config.algoName,
                                                   args=[self.config, self.logger, users['rows'], pool_size],
                                                   getFromCache=False,
                                                   listFlag=True)
            active_users = sorted_users()[:self.config.publication_pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)

        return active_users
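# Illustrative sketch (not part of the original module): the deduplication of
# (username, group, role) triples done above, shown on its own with a tiny input.
# Rows are filtered on transfer_state == 3, the same value the original query
# result is filtered on; the helper itself is an assumption.
def unique_publication_users(rows):
    return [list(t) for t in set((r['username'], r['user_group'], r['user_role'])
                                 for r in rows if r['transfer_state'] == 3)]

# rows = [{'username': 'alice', 'user_group': '', 'user_role': '', 'transfer_state': 3},
#         {'username': 'alice', 'user_group': '', 'user_role': '', 'transfer_state': 3}]
# unique_publication_users(rows) -> [['alice', '', '']]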
def publishInDBS3(taskname): """ """ def createLogdir(dirname): """ Create the directory dirname ignoring erors in case it exists. Exit if the directory cannot be created. """ try: os.mkdir(dirname) except OSError as ose: if ose.errno != 17: #ignore the "Directory already exists error" print(str(ose)) print("The task worker need to access the '%s' directory" % dirname) sys.exit(1) createLogdir('taskLogs') logger = logging.getLogger(taskname) logging.basicConfig(filename='taskLogs/'+taskname+'.log', level=logging.INFO, format=config.General.logMsgFormat) logger.info("Getting files to publish") toPublish = [] # TODO move from new to done when processed with open("/tmp/publisher_files/"+taskname+".json") as f: toPublish = json.load(f) workflow = taskname if len(toPublish) == 0: return "EMPTY" if not workflow: logger.info("NO TASKNAME: %s" % toPublish[0]) for k, v in toPublish[0].iteritems(): if k == 'taskname': logger.info("Starting: %s: %s" % (k, v)) wfnamemsg = "%s: " % (workflow) user = toPublish[0]["User"] try: group = toPublish[0]["Group"] role = toPublish[0]["Role"] except: group = "" role = "" if not group or group in ['null']: group = "" if not role or role in ['null']: role = "" userDN = toPublish[0]["UserDN"] pnn = toPublish[0]["Destination"] logger.info(wfnamemsg+" "+user) READ_PATH = "/DBSReader" READ_PATH_1 = "/DBSReader/" # TODO: get user role and group try: proxy = Proxy(userDN, group, role, logger) except: logger.exception("Failed to retrieve user proxy") return "FAILED" oracelInstance = config.General.oracleDB oracleDB = HTTPRequests(oracelInstance, proxy, proxy) fileDoc = dict() fileDoc['subresource'] = 'search' fileDoc['workflow'] = workflow try: results = oracleDB.get(task_path, data=encodeRequest(fileDoc)) except Exception as ex: logger.error("Failed to get acquired publications from oracleDB for %s: %s" % (workflow, ex)) return "FAILED" logger.info(results[0]['desc']['columns']) try: inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset") inputDataset = results[0]['result'][inputDatasetIndex] sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url") sourceURL = results[0]['result'][sourceURLIndex] publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url") publish_dbs_url = results[0]['result'][publish_dbs_urlIndex] #sourceURL = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader" if not sourceURL.endswith(READ_PATH) and not sourceURL.endswith(READ_PATH_1): sourceURL += READ_PATH except Exception: logger.exception("ERROR") # When looking up parents may need to look in global DBS as well. 
globalURL = sourceURL globalURL = globalURL.replace('phys01', 'global') globalURL = globalURL.replace('phys02', 'global') globalURL = globalURL.replace('phys03', 'global') globalURL = globalURL.replace('caf', 'global') pr = os.environ.get("SOCKS5_PROXY") logger.info(wfnamemsg+"Source API URL: %s" % sourceURL) sourceApi = dbsClient.DbsApi(url=sourceURL, proxy=pr) logger.info(wfnamemsg+"Global API URL: %s" % globalURL) globalApi = dbsClient.DbsApi(url=globalURL, proxy=pr) WRITE_PATH = "/DBSWriter" MIGRATE_PATH = "/DBSMigrate" READ_PATH = "/DBSReader" if publish_dbs_url.endswith(WRITE_PATH): publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH else: publish_migrate_url = publish_dbs_url + MIGRATE_PATH publish_read_url = publish_dbs_url + READ_PATH publish_dbs_url += WRITE_PATH try: logger.debug(wfnamemsg+"Destination API URL: %s" % publish_dbs_url) destApi = dbsClient.DbsApi(url=publish_dbs_url, proxy=pr) logger.debug(wfnamemsg+"Destination read API URL: %s" % publish_read_url) destReadApi = dbsClient.DbsApi(url=publish_read_url, proxy=pr) logger.debug(wfnamemsg+"Migration API URL: %s" % publish_migrate_url) migrateApi = dbsClient.DbsApi(url=publish_migrate_url, proxy=pr) except: logger.exception('Wrong DBS URL %s' % publish_dbs_url) return "FAILED" logger.info("inputDataset: %s" % inputDataset) noInput = len(inputDataset.split("/")) <= 3 # TODO: fix dbs dep if not noInput: try: existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*') primary_ds_type = existing_datasets[0]['primary_ds_type'] # There's little chance this is correct, but it's our best guess for now. # CRAB2 uses 'crab2_tag' for all cases existing_output = destReadApi.listOutputConfigs(dataset=inputDataset) except: logger.exception('Wrong DBS URL %s' % publish_dbs_url) return "FAILED" if not existing_output: msg = "Unable to list output config for input dataset %s." % (inputDataset) logger.error(wfnamemsg+msg) global_tag = 'crab3_tag' else: global_tag = existing_output[0]['global_tag'] else: msg = "This publication appears to be for private MC." logger.info(wfnamemsg+msg) primary_ds_type = 'mc' global_tag = 'crab3_tag' acquisition_era_name = "CRAB" processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'} appName = 'cmsRun' appVer = toPublish[0]["swversion"] pset_hash = toPublish[0]['publishname'].split("-")[-1] gtag = str(toPublish[0]['globaltag']) if gtag == "None": gtag = global_tag try: if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]: acquisitionera = str(toPublish[0]['acquisitionera']) else: acquisitionera = acquisition_era_name except: acquisitionera = acquisition_era_name _, primName, procName, tier = toPublish[0]['outdataset'].split('/') primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type} msg = "About to insert primary dataset: %s" % (str(primds_config)) logger.debug(wfnamemsg+msg) destApi.insertPrimaryDataset(primds_config) msg = "Successfully inserted primary dataset %s." % (primName) logger.debug(wfnamemsg+msg) final = {} failed = [] publish_in_next_iteration = [] published = [] dataset = toPublish[0]['outdataset'] # Find all (valid) files already published in this dataset. 
try: existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True) existingFiles = [f['logical_file_name'] for f in existingDBSFiles] existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']] msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles)) msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid)) logger.info(wfnamemsg+msg) final['existingFiles'] = len(existingFiles) except Exception as ex: msg = "Error when listing files in DBS: %s" % (str(ex)) msg += "\n%s" % (str(traceback.format_exc())) logger.error(wfnamemsg+msg) return "FAILED" # check if actions are needed workToDo = False for fileTo in toPublish: #print(existingFilesValid) if fileTo['lfn'] not in existingFilesValid: workToDo = True break if not workToDo: msg = "Nothing uploaded, %s has these files already or not enough files." % (dataset) logger.info(wfnamemsg+msg) return "NOTHING TO DO" acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0} output_config = {'release_version': appVer, 'pset_hash': pset_hash, 'app_name': appName, 'output_module_label': 'o', 'global_tag': global_tag, } msg = "Published output config." logger.debug(wfnamemsg+msg) dataset_config = {'dataset': dataset, 'processed_ds_name': procName, 'data_tier_name': tier, 'acquisition_era_name': acquisitionera, 'dataset_access_type': 'VALID', 'physics_group_name': 'CRAB3', 'last_modification_date': int(time.time()), } msg = "About to insert dataset: %s" % (str(dataset_config)) logger.info(wfnamemsg+msg) del dataset_config['acquisition_era_name'] # List of all files that must (and can) be published. dbsFiles = [] dbsFiles_f = [] # Set of all the parent files from all the files requested to be published. parentFiles = set() # Set of parent files for which the migration to the destination DBS instance # should be skipped (because they were not found in DBS). parentsToSkip = set() # Set of parent files to migrate from the source DBS instance # to the destination DBS instance. localParentBlocks = set() # Set of parent files to migrate from the global DBS instance # to the destination DBS instance. globalParentBlocks = set() # Loop over all files to publish. for file_ in toPublish: logger.info(file_) # Check if this file was already published and if it is valid. if file_['lfn'] not in existingFilesValid: # We have a file to publish. # Get the parent files and for each parent file do the following: # 1) Add it to the list of parent files. # 2) Find the block to which it belongs and insert that block name in # (one of) the set of blocks to be migrated to the destination DBS. for parentFile in list(file_['parents']): if parentFile not in parentFiles: parentFiles.add(parentFile) # Is this parent file already in the destination DBS instance? # (If yes, then we don't have to migrate this block.) blocksDict = destReadApi.listBlocks(logical_file_name=parentFile) if not blocksDict: # No, this parent file is not in the destination DBS instance. # Maybe it is in the same DBS instance as the input dataset? blocksDict = sourceApi.listBlocks(logical_file_name=parentFile) if blocksDict: # Yes, this parent file is in the same DBS instance as the input dataset. # Add the corresponding block to the set of blocks from the source DBS # instance that have to be migrated to the destination DBS. localParentBlocks.add(blocksDict[0]['block_name']) else: # No, this parent file is not in the same DBS instance as input dataset. 
# Maybe it is in global DBS instance? blocksDict = globalApi.listBlocks(logical_file_name=parentFile) if blocksDict: # Yes, this parent file is in global DBS instance. # Add the corresponding block to the set of blocks from global DBS # instance that have to be migrated to the destination DBS. globalParentBlocks.add(blocksDict[0]['block_name']) # If this parent file is not in the destination DBS instance, is not # the source DBS instance, and is not in global DBS instance, then it # means it is not known to DBS and therefore we can not migrate it. # Put it in the set of parent files for which migration should be skipped. if not blocksDict: parentsToSkip.add(parentFile) # If this parent file should not be migrated because it is not known to DBS, # we remove it from the list of parents in the file-to-publish info dictionary # (so that when publishing, this "parent" file will not appear as a parent). if parentFile in parentsToSkip: msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile) logger.info(wfnamemsg+msg) if parentFile in file_['parents']: file_['parents'].remove(parentFile) # Add this file to the list of files to be published. dbsFiles.append(format_file_3(file_)) dbsFiles_f.append(file_) #print file published.append(file_['SourceLFN']) #published.append(file_['lfn'].replace("/store","/store/temp")) # Print a message with the number of files to publish. msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles)) logger.info(wfnamemsg+msg) # If there are no files to publish, continue with the next dataset. if len(dbsFiles_f) == 0: msg = "Nothing to do for this dataset." logger.info(wfnamemsg+msg) return "NOTHING TO DO" # Migrate parent blocks before publishing. # First migrate the parent blocks that are in the same DBS instance # as the input dataset. if localParentBlocks: msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks) logger.info(wfnamemsg+msg) statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, sourceApi, inputDataset, localParentBlocks ) if statusCode: failureMsg += " Not publishing any files." logger.info(wfnamemsg+failureMsg) failed.extend([f['SourceLFN'] for f in dbsFiles_f]) #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f]) failure_reason = failureMsg published = [x for x in published[dataset] if x not in failed[dataset]] return "NOTHING TO DO" # Then migrate the parent blocks that are in the global DBS instance. if globalParentBlocks: msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks) logger.info(wfnamemsg+msg) statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, globalApi, inputDataset, globalParentBlocks) if statusCode: failureMsg += " Not publishing any files." logger.info(wfnamemsg+failureMsg) failed.extend([f['SourceLFN'] for f in dbsFiles_f]) #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f]) failure_reason = failureMsg published = [x for x in published[dataset] if x not in failed[dataset]] return "NOTHING TO DO" # Publish the files in blocks. The blocks must have exactly max_files_per_block # files, unless there are less than max_files_per_block files to publish to # begin with. If there are more than max_files_per_block files to publish, # publish as many blocks as possible and leave the tail of files for the next # PublisherWorker call, unless forced to published. 
block_count = 0 count = 0 max_files_per_block = config.General.max_files_per_block while True: block_name = "%s#%s" % (dataset, str(uuid.uuid4())) files_to_publish = dbsFiles[count:count+max_files_per_block] try: block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0} msg = "Inserting files %s into block %s." % ([f['logical_file_name'] for f in files_to_publish], block_name) logger.info(wfnamemsg+msg) blockDump = createBulkBlock(output_config, processing_era_config, primds_config, dataset_config, acquisition_era_config, block_config, files_to_publish) #logger.debug(wfnamemsg+"Block to insert: %s\n %s" % (blockDump, destApi.__dict__ )) destApi.insertBulkBlock(blockDump) block_count += 1 except Exception as ex: #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish]) logger.error("Error for files: %s" % [f['lfn'] for f in toPublish]) failed.extend([f['SourceLFN'] for f in toPublish]) #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish]) msg = "Error when publishing (%s) " % ", ".join(failed) msg += str(ex) msg += str(traceback.format_exc()) logger.error(wfnamemsg+msg) failure_reason = str(ex) file='/tmp/failed-block-at-%s.txt' % time.time() with open(file,'write') as fd: fd.write(blockDump) logger.error("FAILING BLOCK SAVED AS %s" % file) count += max_files_per_block files_to_publish_next = dbsFiles_f[count:count+max_files_per_block] if len(files_to_publish_next) < max_files_per_block: publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next]) #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next]) break published = [x for x in published if x not in failed + publish_in_next_iteration] # Fill number of files/blocks published for this dataset. final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration) final['blocks'] = block_count # Print a publication status summary for this dataset. msg = "End of publication status for dataset %s:" % (dataset) msg += " failed (%s) %s" % (len(failed), failed) msg += ", published (%s) %s" % (len(published), published) msg += ", publish_in_next_iteration (%s) %s" % (len(publish_in_next_iteration), publish_in_next_iteration) msg += ", results %s" % (final) logger.info(wfnamemsg+msg) try: if published: mark_good(workflow, published, oracleDB, logger) if failed: logger.debug("Failed files: %s " % failed) mark_failed(failed, oracleDB, logger, failure_reason) except: logger.exception("Status update failed") return 0
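# Illustrative sketch (not part of the original module): the block-publication
# loop above walks dbsFiles in windows of max_files_per_block. The same walk can
# be written as a small generator; the generator form is an assumption added for
# clarity and is not used by publishInDBS3 itself.
def iter_blocks(dbs_files, max_files_per_block):
    """Yield consecutive slices of at most max_files_per_block files."""
    count = 0
    while count < len(dbs_files):
        yield dbs_files[count:count + max_files_per_block]
        count += max_files_per_block

# list(iter_blocks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]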
def mark_failed(self, files=[], force_fail=False, submission_error=False): """ Something failed for these files so increment the retry count """ updated_lfn = [] for lfn in files: data = {} if not isinstance(lfn, dict): if 'temp' not in lfn: temp_lfn = lfn.replace('store', 'store/temp', 1) else: temp_lfn = lfn else: if 'temp' not in lfn['value'][0]: temp_lfn = lfn['value'][0].replace('store', 'store/temp', 1) else: temp_lfn = lfn['value'][0] # Load document and get the retry_count if self.config.isOracle: docId = getHashLfn(temp_lfn) self.logger.debug("Marking failed %s" % docId) try: docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'fileusertransfers'), data=encodeRequest({'subresource': 'getById', 'id': docId})) except Exception as ex: self.logger.error("Error updating failed docs: %s" %ex) continue document = oracleOutputMapping(docbyId, None)[0] self.logger.debug("Document: %s" % document) fileDoc = dict() fileDoc['asoworker'] = self.config.asoworker fileDoc['subresource'] = 'updateTransfers' fileDoc['list_of_ids'] = docId if force_fail or document['transfer_retry_count'] + 1 > self.max_retry: fileDoc['list_of_transfer_state'] = 'FAILED' fileDoc['list_of_retry_value'] = 1 else: fileDoc['list_of_transfer_state'] = 'RETRY' if submission_error: fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS" fileDoc['list_of_retry_value'] = 1 elif not self.valid_proxy: fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: user's proxy expired" fileDoc['list_of_retry_value'] = 1 else: fileDoc['list_of_failure_reason'] = "Site config problem." fileDoc['list_of_retry_value'] = 1 self.logger.debug("update: %s" % fileDoc) try: updated_lfn.append(docId) result = self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(fileDoc)) except Exception as ex: msg = "Error updating document" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue else: docId = getHashLfn(temp_lfn) try: document = self.db.document(docId) except Exception as ex: msg = "Error loading document from couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed': now = str(datetime.datetime.now()) last_update = time.time() # Prepare data to update the document in couch if force_fail or len(document['retry_count']) + 1 > self.max_retry: data['state'] = 'failed' else: data['state'] = 'retry' if submission_error: data['failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS" elif not self.valid_proxy: data['failure_reason'] = "Job could not be submitted to FTS: user's proxy expired" else: data['failure_reason'] = "Site config problem." data['last_update'] = last_update data['retry'] = now # Update the document in couch self.logger.debug("Marking failed %s" % docId) try: updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId updateUri += "?" 
+ urllib.urlencode(data) self.db.makeRequest(uri=updateUri, type="PUT", decode=False) updated_lfn.append(docId) self.logger.debug("Marked failed %s" % docId) except Exception as ex: msg = "Error in updating document in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue try: self.db.commit() except Exception as ex: msg = "Error commiting documents in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue self.logger.debug("failed file updated") return updated_lfn
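# Illustrative sketch (not part of the original module): the failure-reason
# selection used above, factored out. 'submission_error' and 'valid_proxy' carry
# the same meaning as in mark_failed; the helper itself is an assumption.
def failure_reason(submission_error, valid_proxy):
    if submission_error:
        return "Job could not be submitted to FTS: temporary problem of FTS"
    if not valid_proxy:
        return "Job could not be submitted to FTS: user's proxy expired"
    return "Site config problem."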
def killThread(self, thread_id, transfers): """This is the worker thread function for kill command. """ while True: transfer_list = transfers.get() self.logger.info("Starting thread %s" % (thread_id)) user = transfer_list[0]['username'] group = transfer_list[0]['user_group'] role = transfer_list[0]['user_role'] uiSetupScript = getattr(self.config, 'UISetupScript', None) self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy)) try: userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy) except Exception as ex: msg = "Error retrieving the user DN" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue if not userDN: transfers.task_done() time.sleep(1) continue self.logger.debug("user DN: %s" % userDN) try: defaultDelegation = {'logger': self.logger, 'credServerPath': self.config.credentialDir, 'myProxySvr': 'myproxy.cern.ch', 'min_time_left': getattr(self.config, 'minTimeLeft', 36000), 'serverDN': self.config.serverDN, 'uisource': uiSetupScript, 'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)} if hasattr(self.config, "cache_area"): cache_area = self.config.cache_area defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0] except IndexError: self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area) transfers.task_done() time.sleep(1) continue if getattr(self.config, 'serviceCert', None): defaultDelegation['server_cert'] = self.config.serviceCert if getattr(self.config, 'serviceKey', None): defaultDelegation['server_key'] = self.config.serviceKey try: defaultDelegation['userDN'] = userDN defaultDelegation['group'] = group if group else '' defaultDelegation['role'] = role if group else '' self.logger.debug('delegation: %s' % defaultDelegation) valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger) except Exception as ex: msg = "Error getting the user proxy" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) transfers.task_done() time.sleep(1) continue # TODO: take server from db, right now, take only the first of the list and assuming it valid for all try: # TODO: debug u added during info upload. To be fixed soon! For now worked around fts_server = transfer_list[0]['fts_instance'].split('u')[1] self.logger.info("Delegating proxy to %s" % fts_server) context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True) self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False)) self.logger.info("Proxy delegated. Grouping files by jobId") jobs = {} for fileToKill in transfer_list: # TODO: debug u added during info upload. To be fixed soon! For now worked around jid = str(fileToKill['fts_id']).split('u')[1] if jid not in jobs: jobs[jid] = [] jobs[jid].append(fileToKill) self.logger.info("Found %s jobIds", len(jobs.keys())) self.logger.debug("jobIds: %s", jobs.keys) # list for files killed or failed to killed = [] too_late = [] for ftsJobId, files in jobs.iteritems(): self.logger.info("Cancelling tranfers in %s" % ftsJobId) ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files] source_lfns = [x['source_lfn'] for x in files] job_list = fts3.get_job_status(context, ftsJobId, list_files=True) tx = job_list['files'] # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db? 
idListToKill = [x['file_id'] for x in tx if x['dest_surl'].split('/cms/store/')[1] in ref_lfns] # needed for the state update lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx if x['dest_surl'].split('/cms/store/')[1] in ref_lfns] self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill)) res = fts3.cancel(context, ftsJobId, idListToKill) self.logger.debug('Kill command result: %s' % json.dumps(res)) if not isinstance(res, list): res = [res] # Verify if the kill command succeeded for k, kill_res in enumerate(res): indexToUpdate = lfnListToKill[k] if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"): self.logger.debug(source_lfns[indexToUpdate]) too_late.append(getHashLfn(source_lfns[indexToUpdate])) else: killed.append(getHashLfn(source_lfns[indexToUpdate])) # TODO: decide how to update status for too_late files killed += too_late self.logger.debug('Updating status of killed files: %s' % killed) if len(killed) > 0: data = dict() data['asoworker'] = self.config.asoworker data['subresource'] = 'updateTransfers' data['list_of_ids'] = killed data['list_of_transfer_state'] = ["KILLED" for _ in killed] self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(data)) self.logger.debug("Marked killed %s" % killed) except: # TODO: split and improve try/except self.logger.exception('Kill command failed') transfers.task_done()
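# Illustrative sketch (not part of the original module): the dest_surl -> FTS
# file_id matching used above when cancelling transfers, shown on its own. The
# '/cms/store/' split mirrors the workaround already in killThread; the helper
# name is an assumption.
def fts_ids_to_kill(fts_files, ref_lfns):
    """Return the FTS file ids whose destination SURL tail matches one of ref_lfns."""
    ids = []
    for f in fts_files:
        tail = f['dest_surl'].split('/cms/store/')[1]
        if tail in ref_lfns:
            ids.append(f['file_id'])
    return ids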
def mark_good(self, files): """ Mark the list of files as tranferred """ updated_lfn = [] good_ids = [] if len(files) == 0: return updated_lfn for it, lfn in enumerate(files): hash_lfn = getHashLfn(lfn) self.logger.info("Marking good %s" % hash_lfn) self.logger.debug("Marking good %s" % lfn) if not self.config.isOracle: try: document = self.db.document(hash_lfn) except Exception as ex: msg = "Error loading document from couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue self.logger.info("Doc %s Loaded" % hash_lfn) try: now = str(datetime.datetime.now()) last_update = time.time() if self.config.isOracle: docId = getHashLfn(lfn) good_ids.append(docId) updated_lfn.append(lfn) else: if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed': outputLfn = document['lfn'].replace('store/temp', 'store', 1) data = dict() data['end_time'] = now data['state'] = 'done' data['lfn'] = outputLfn data['last_update'] = last_update updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn) updateUri += "?" + urllib.urlencode(data) self.db.makeRequest(uri = updateUri, type = "PUT", decode = False) updated_lfn.append(lfn) self.logger.debug("Marked good %s" % lfn) else: updated_lfn.append(lfn) try: self.db.commit() except Exception as ex: msg = "Error commiting documents in couch" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue except Exception as ex: msg = "Error updating document" msg += str(ex) msg += str(traceback.format_exc()) self.logger.error(msg) continue if self.config.isOracle: try: data = dict() data['asoworker'] = self.config.asoworker data['subresource'] = 'updateTransfers' data['list_of_ids'] = good_ids data['list_of_transfer_state'] = ["DONE" for x in good_ids] result = self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(data)) self.logger.debug("Marked good %s" % good_ids) except Exception: self.logger.exception('Error updating document') return {} self.logger.info("Transferred file %s updated, removing now source file" %docId) try: docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers','fileusertransfers'), data=encodeRequest({'subresource': 'getById', 'id': docId})) document = oracleOutputMapping(docbyId, None)[0] except Exception: msg = "Error getting file from source" self.logger.exception(msg) return {} if document["source"] not in self.site_tfc_map: self.logger.debug("site not found... gathering info from phedex") self.site_tfc_map[document["source"]] = self.get_tfc_rules(document["source"]) pfn = self.apply_tfc_to_lfn( '%s:%s' %(document["source"], lfn)) self.logger.debug("File has to be removed now from source site: %s" %pfn) self.remove_files(self.userProxy, pfn) self.logger.debug("Transferred file removed from source") return updated_lfn
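# Illustrative sketch (not part of the original module): the Oracle update issued
# by mark_good for a list of document ids, shown as a standalone payload builder.
# Field names are the ones already used above; the helper is an assumption added
# for clarity.
def done_update_payload(asoworker, good_ids):
    data = dict()
    data['asoworker'] = asoworker
    data['subresource'] = 'updateTransfers'
    data['list_of_ids'] = good_ids
    data['list_of_transfer_state'] = ["DONE" for _ in good_ids]
    return data

# self.oracleDB.post(self.config.oracleFileTrans, data=encodeRequest(done_update_payload(...)))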
def testFileTransferPUT(self): """ _testFileTransferPUT_ Just test simple testFileTransferPUT with fake data """ # We just sent fake data which is not monitored by dashboard. # Also only the first time to decide is publication ON or NOT for user in self.users: timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime()) for i in range(self.totalFiles): now = int(time.time()) # Generate a taskname workflowName = "" taskname = "" if user not in self.tasks: workflowName = "".join([random.choice(string.ascii_lowercase) for _ in range(20)]) + "_" + str(now) publicationState = random.choice(['NEW', 'NOT_REQUIRED']) else: workflowName = self.tasks[user]['workflowName'] publicationState = self.tasks[user]['publication'] transferState = random.choice(['NEW', 'DONE']) taskname = generateTaskName(user, workflowName, timestamp) finalLfn = self.lfnBase % (user, workflowName, i, random.randint(1, 9999)) idHash = getHashLfn(finalLfn) self.fileDoc['id'] = idHash self.fileDoc['job_id'] = i self.fileDoc['username'] = user self.fileDoc['taskname'] = taskname self.fileDoc['start_time'] = int(time.time()) self.fileDoc['source_lfn'] = finalLfn self.fileDoc['destination_lfn'] = finalLfn self.fileDoc['transfer_state'] = transferState self.fileDoc['publication_state'] = publicationState print(self.fileDoc) self.server.put('/crabserver/dev/fileusertransfers', data=encodeRequest(self.fileDoc)) # if I will put the same doc twice, it should raise an error. # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc)) # This tasks are for the future and next calls if user not in self.tasks: self.tasks[user] = {'workflowName': workflowName, 'taskname': taskname, 'listOfIds': [], 'publication': publicationState, 'toTransfer': 0, 'toPublish': 0, 'total': self.totalFiles} if self.tasks[user]['publication'] == 'NEW': self.tasks[user]['toPublish'] += 1 if transferState == 'NEW': self.tasks[user]['toTransfer'] += 1 self.tasks[user]['listOfIds'].append(idHash) # This should raise an error for username in self.tasks: taskname = self.tasks[username]['taskname'] for query in ['getTransferStatus', 'getPublicationStatus']: result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': query, 'username': username, 'taskname': taskname})) print(result) print(result[0]['result']) taskInfoDict = oracleOutputMapping(result, 'id') print(taskInfoDict) for key, docDict in taskInfoDict.items(): result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'getById', 'id': key})) randomUsers = random.sample(set(self.users), 3) # Take half of the users and kill their transfers for specific task for username in randomUsers: taskname = self.tasks[username]['taskname'] result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfers', 'username': username, 'taskname': taskname})) print(result) # oneUser is left for killing a list of IDs # leftUsers will be killing transfers one by one for specific id. 
leftUsers = list(set(self.users) - set(randomUsers)) oneUser = random.sample(set(leftUsers), 1) leftUsers = list(set(leftUsers) - set(oneUser)) for username in leftUsers: # First get all left ids for this users result = self.server.get('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'getTransferStatus', 'username': username, 'taskname': self.tasks[username]['taskname']})) resultOut = oracleOutputMapping(result, None) print("**"*50) for outDict in resultOut: print(outDict) result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfersById', 'username': username, 'listOfIds': outDict['id']})) print(result) print(resultOut) print(result) for username in oneUser: result = self.server.post('/crabserver/dev/fileusertransfers', data=encodeRequest({'subresource': 'killTransfersById', 'username': username, 'listOfIds': self.tasks[username]['listOfIds']}, ['listOfIds'])) # As it asks to kill all which are in new, need to double check what we submitted before and if the output of killed is correct print(result) print(self.tasks[username])
def submit(trans_tuple, job_data, log, direct=False): """Manage threads for transfers submission through Rucio :param trans_tuple: ordered list of needed xfer info (transfers, to_submit_columns) :type trans_tuple: tuple :param job_data: general CRAB job metadata :type job_data: dict :param log: log object :type log: logging :param direct: job output stored on temp or directly, defaults to False :param direct: bool, optional """ threadLock = threading.Lock() threads = [] to_update = [] toTrans = trans_tuple[0] columns = trans_tuple[1] proxy = job_data['proxy'] #rest_filetransfers = job_data['rest'] crabserver = job_data['crabserver'] user = job_data['username'] destination = job_data['destination'] taskname = job_data['taskname'] scope = 'user.' + user try: os.environ["X509_USER_PROXY"] = proxy log.info("Initializing Rucio client for %s", taskname) crabInj = CRABDataInjector(taskname, destination, account=user, scope=scope, auth_type='x509_proxy') except Exception as ex: log.error("Failed to load RUCIO client: %s", ex) raise ex # Split threads by source RSEs sources = list(set([x[columns.index('source')] for x in toTrans])) os.environ["X509_CERT_DIR"] = os.getcwd() # mapping lfn <--> pfn for source in sources: ids = [x[columns.index('id')] for x in toTrans if x[columns.index('source')] == source] src_lfns = [x[columns.index('source_lfn')] for x in toTrans if x[columns.index('source')] == source] dst_lfns = [x[columns.index('destination_lfn')] for x in toTrans if x[columns.index('source')] == source] sorted_source_pfns = [] sorted_dest_lfns = [] sorted_dest_pfns = [] # workaround for phedex.getPFN issue --> shuffling output order w.r.t. the list in input try: for chunk in chunks(src_lfns, 10): unsorted_source_pfns = [[k.split(scope+":")[1], str(x)] for k, x in crabInj.cli.lfns2pfns(source, [scope + ":" + y for y in chunk]).items()] #log.info(unsorted_source_pfns) for order_lfn in chunk: for lfn, pfn in unsorted_source_pfns: if order_lfn == lfn: sorted_source_pfns.append(pfn) break for chunk in chunks(dst_lfns, 10): unsorted_dest_pfns = [[k.split(scope+":")[1], str(x)] for k, x in crabInj.cli.lfns2pfns(toTrans[0][4], [scope + ":" + y for y in chunk]).items()] #log.info(unsorted_dest_pfns) for order_lfn in chunk: for lfn, pfn in unsorted_dest_pfns: if order_lfn == lfn: sorted_dest_pfns.append(pfn) sorted_dest_lfns.append(lfn) break except Exception as ex: log.error("Failed to map lfns to pfns: %s", ex) mark_failed(ids, ["Failed to map lfn to pfn: " + str(ex) for _ in ids], crabserver) source_pfns = sorted_source_pfns dest_lfns = sorted_dest_lfns # saving file sizes and checksums filesizes = [x[columns.index('filesize')] for x in toTrans if x[columns.index('source')] == source] checksums = [x[columns.index('checksums')] for x in toTrans if x[columns.index('source')] == source] pubnames = [x[columns.index('publishname')] for x in toTrans if x[columns.index('source')] == source] # ordered list of replicas information try: jobs = zip(source_pfns, dest_lfns, ids, checksums, filesizes, pubnames) except Exception as ex: log.error("Failed to gather all job information: %s", ex) job_columns = ['source_pfns', 'dest_lfns', 'ids', 'checksums', 'filesizes', 'pubnames'] # ordered list of transfers details tx_from_source = [[job, source, taskname, user, destination] for job in jobs] tx_columns = ['job', 'source', 'taskname', 'user', 'destination'] # split submission process in chunks of max 200 files for files in chunks(tx_from_source, 200): if not direct: log.info("Submitting: %s", files) thread = 
submit_thread(threadLock, log, (files, tx_columns), job_columns, proxy, to_update, crabInj) thread.start() threads.append(thread) elif direct: log.info("Registering direct stageout: %s", files) thread = submit_thread(threadLock, log, (files, tx_columns), job_columns, proxy, to_update, crabInj, direct=True) thread.start() threads.append(thread) for t in threads: t.join() if len(to_update) == 0: return False # update statuses in oracle table as per threads result for fileDoc in to_update: try: #TODO: split submitted from submitted failed! log.debug("POSTing to crabserver 'filetransfer' api:\n%s", encodeRequest(fileDoc)) crabserver.post('filetransfers', data=encodeRequest(fileDoc)) log.info("Marked submitted %s files" % (fileDoc['list_of_ids'])) except Exception: log.exception('Failed to mark files as submitted on DBs') return True
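# The chunks() helper used in submit() is not shown in this excerpt; a minimal
# equivalent (an assumption about its behavior, inferred from how it is called
# above with sizes 10 and 200) would be:
def chunks(lst, size):
    """Yield successive slices of at most `size` elements from lst."""
    for i in range(0, len(lst), size):
        yield lst[i:i + size]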
cmd = "curl -i -F file=@%s xsls.cern.ch"%xmllocation try: pu = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) break except Exception, e: logger.debug(str(e)) maxi = maxi + 1 continue if __name__ == "__main__": server = HTTPRequests('cmsweb-testbed.cern.ch', '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy', '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy') result = server.get('/crabserver/preprod/filetransfers', data=encodeRequest({'subresource': 'groupedTransferStatistics', 'grouping': 0})) results = oracleOutputMapping(result) status = {'transfers':{}, 'publications':{}} tmp = {'transfers':{ 'DONE':0, 'ACQUIRED':0, 'SUBMITTED':0, 'FAILED':0, 'RETRY':0 }, 'publications':{'DONE':0, 'ACQUIRED':0, 'NEW':0, 'FAILED':0, 'RETRY':0}} #past = open("tmp_transfer") #tmp = json.load(past) for doc in results: if doc['aso_worker']=="asodciangot1": if 'transfers' in tmp and TRANSFERDB_STATES[doc['transfer_state']] in tmp['transfers']: status['transfers'][TRANSFERDB_STATES[doc['transfer_state']]] = - tmp['transfers'][TRANSFERDB_STATES[doc['transfer_state']]] + doc['nt']