def testWorkflowsInfo(self):
    "Test function for workflowsInfo()"
    state = 'assignment-approved'
    workflows = getWorkflows(state)
    winfo = workflowsInfo(workflows)
    # datasets = [d for row in winfo.values() for d in row['datasets']]
    # pileups = [d for row in winfo.values() for d in row['pileups']]
    keys = sorted(['datasets', 'pileups', 'priority', 'selist', 'campaign'])
    for wdict in winfo.values():
        self.assertEqual(keys, sorted(wdict.keys()))
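# For reference, each value in the dict returned by workflowsInfo() is expected
# to carry exactly the keys asserted in the test above. A hypothetical summary
# entry (all values are made up for illustration) would look like:
#
#     {'datasets': ['/Primary/Processed-v1/TIER'],
#      'pileups': [],
#      'priority': 100000,
#      'selist': ['T1_US_FNAL', 'T2_CH_CERN'],
#      'campaign': 'SomeCampaign'}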
def unifiedUnused(self):
    """
    FIXME FIXME TODO
    Leave this code in a different method until we evaluate what is needed
    and what is not, and refactor this thing...
    """
    # FIXME making pylint happy, remove these assignments
    requestNames = []
    uConfig = {}
    # requestNames = [r.getName() for r in workflows]

    # TODO: the logic below shows the original unified port and it should be
    # revisited wrt new proposal specs and unified codebase

    # get workflows from list of requests
    orig = time.time()
    time0 = time.time()
    requestWorkflows = self._getRequestWorkflows(requestNames)
    requestWorkflows = requestWorkflows.values()
    self.logger.debug(elapsedTime(time0, "### getWorkflows"))

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(requestWorkflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes, _datasetTransfers = dbsInfo(datasets, self.msConfig['dbsUrl'])
    self.logger.debug(elapsedTime(time0, "### dbsInfo"))

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets, self.msConfig['phedexUrl'])
    self.logger.debug(elapsedTime(time0, "### phedexInfo"))

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets, self.msConfig['dbsUrl'])
    self.logger.debug(elapsedTime(time0, "### eventsLumisInfo"))

    # get specs for all requests and re-use them later in getSiteWhiteList as cache
    reqSpecs = self._getRequestSpecs(requestNames)

    # get siteInfo instance once and re-use it later, it is a time-consuming object
    siteInfo = SiteInfo(uConfig)

    requestsToProcess = []
    tst0 = time.time()
    totBlocks = totEvents = totSize = totCpuT = 0
    for wflow in requestWorkflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis,
                                    dbsUrl=self.msConfig['dbsUrl'], logger=self.logger)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {'num_event': 0, 'num_lumi': 0})
                nevts += edata['num_event']
                nlumis += edata['num_lumi']

            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            self.logger.debug("### %s", wname)
            self.logger.debug(
                "%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)

            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                = self._getSiteWhiteList(uConfig, wspec, siteInfo, reqSpecs)
            if not isinstance(primary, list):
                primary = [primary]
            if not isinstance(secondary, list):
                secondary = [secondary]
            wflowDatasets = primary + secondary
            wflowDatasetsBlocks = []
            for dset in wflowDatasets:
                for item in datasetBlocks.get(dset, []):
                    wflowDatasetsBlocks.append(item)
            rdict = dict(name=wname, datasets=wflowDatasets,
                         blocks=wflowDatasetsBlocks, npileups=npileups,
                         size=size, nevents=nevts, nlumis=nlumis, cput=cput,
                         ncopies=ncopies, sites=sites, allowedSites=allowedSites,
                         parent=parent, lheInput=lheInput,
                         primary=primary, secondary=secondary)
            requestsToProcess.append(rdict)
            self.logger.debug(elapsedTime(t0, "### getSiteWhiteList"))

    self.logger.debug(
        "total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
        len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize,
        teraBytes(totSize), totCpuT)
    self.logger.debug(elapsedTime(tst0, '### workflows info'))
    self.logger.debug(elapsedTime(orig, '### total time'))
    return requestsToProcess
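# Note: each record collected in requestsToProcess above is a plain dict with
# the keys name, datasets, blocks, npileups, size, nevents, nlumis, cput,
# ncopies, sites, allowedSites, parent, lheInput, primary and secondary;
# downstream consumers are assumed to rely only on these keys.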
def unified(svc, requestRecords, logger):
    """
    Unified Transferor box

    Input parameters:
    :param requestRecords: list of request records, see definition in requestRecord
    :param logger: logger
    """
    # get aux info for dataset/blocks from inputs/parents/pileups
    # make subscriptions based on site white/black lists
    logger.debug("### unified transferor")

    requests = [r['name'] for r in requestRecords]

    ### TODO: the logic below shows the original unified port and it should be
    ### revisited wrt new proposal specs and unified codebase

    # get workflows from list of requests
    orig = time.time()
    time0 = time.time()
    requestWorkflows = getRequestWorkflows(requests)
    workflows = requestWorkflows.values()
    logger.debug(elapsedTime(time0, "### getWorkflows"))

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(workflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    logger.debug(elapsedTime(time0, "### dbsInfo"))

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    logger.debug(elapsedTime(time0, "### phedexInfo"))

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    logger.debug(elapsedTime(time0, "### eventsLumisInfo"))

    # get specs for all requests and re-use them later in getSiteWhiteList as cache
    requests = [v['RequestName'] for w in workflows for v in w.values()]
    reqSpecs = getRequestSpecs(requests)

    # get siteInfo instance once and re-use it later, it is a time-consuming object
    siteInfo = SiteInfo()

    requestsToProcess = []
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {'num_event': 0, 'num_lumi': 0})
                nevts += edata['num_event']
                nlumis += edata['num_lumi']

            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            logger.debug("### %s", wname)
            logger.debug("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s",
                         ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites)

            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                = getSiteWhiteList(svc, wspec, siteInfo, reqSpecs)
            rdict = dict(name=wname, datasets=datasets, blocks=datasetBlocks,
                         npileups=npileups, size=size, nevents=nevts,
                         nlumis=nlumis, cput=cput, ncopies=ncopies,
                         sites=sites, allowedSites=allowedSites, parent=parent,
                         lheInput=lheInput, primary=primary, secondary=secondary)
            requestsToProcess.append(rdict)
            logger.debug(elapsedTime(t0, "### getSiteWhiteList"))

    logger.debug("total # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)",
                 len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize,
                 teraBytes(totSize), totCpuT)
    logger.debug(elapsedTime(tst0, '### workflows info'))
    logger.debug(elapsedTime(orig, '### total time'))
    return requestsToProcess
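# A minimal usage sketch (not executed here): it assumes each entry in
# requestRecords carries at least a 'name' key, as used above, and that `svc`
# is the service object expected by getSiteWhiteList; the request name below
# is purely illustrative.
#
#     import logging
#     logger = logging.getLogger("unified-transferor")
#     records = [{'name': 'some_workflow_request'}]
#     for rec in unified(svc, records, logger):
#         logger.info("workflow %s requires %s copies at %s",
#                     rec['name'], rec['ncopies'], rec['allowedSites'])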
def requestsInfo(reqmgrAuxSvc, state='assignment-approved'):
    """
    Helper function to get information about all requests
    in assignment-approved state in ReqMgr
    """
    # get list of known requests in workqueue
    requestJobs = workqueueRequests(state)
    requests = requestJobs.keys()

    # get workflows from list of requests
    time0 = orig = time.time()
    requestWorkflows = getRequestWorkflows(requests)
    workflows = requestWorkflows.values()
    elapsedTime(time0, "### getWorkflows")

    # time0 = orig = time.time()
    # workflows = getWorkflows(state)
    # elapsedTime(time0, "### getWorkflows")

    # get workflows info summaries and collect datasets we need to process
    winfo = workflowsInfo(workflows)
    datasets = [d for row in winfo.values() for d in row['datasets']]

    # find dataset info
    time0 = time.time()
    datasetBlocks, datasetSizes = dbsInfo(datasets)
    elapsedTime(time0, "### dbsInfo")

    # find block nodes information for our datasets
    time0 = time.time()
    blockNodes = phedexInfo(datasets)
    elapsedTime(time0, "### phedexInfo")

    # find events-lumis info for our datasets
    time0 = time.time()
    eventsLumis = eventsLumisInfo(datasets)
    elapsedTime(time0, "### eventsLumisInfo")

    # get specs for all requests and re-use them later in getSiteWhiteList as cache
    requests = [v['RequestName'] for w in workflows for v in w.values()]
    reqSpecs = getRequestSpecs(requests)

    # get siteInfo instance once and re-use it later, it is a time-consuming object
    siteInfo = SiteInfo()

    requests = {}
    totBlocks = totEvents = totSize = totCpuT = 0
    tst0 = time.time()
    for wflow in workflows:
        for wname, wspec in wflow.items():
            time0 = time.time()
            cput = getComputingTime(wspec, eventsLumis=eventsLumis)
            ncopies = getNCopies(cput)

            attrs = winfo[wname]
            ndatasets = len(attrs['datasets'])
            npileups = len(attrs['pileups'])
            nblocks = nevts = nlumis = size = 0
            nodes = set()
            for dataset in attrs['datasets']:
                blocks = datasetBlocks[dataset]
                for blk in blocks:
                    for node in blockNodes.get(blk, []):
                        nodes.add(node)
                nblocks += len(blocks)
                size += datasetSizes[dataset]
                edata = eventsLumis.get(dataset, {'num_event': 0, 'num_lumi': 0})
                nevts += edata['num_event']
                nlumis += edata['num_lumi']

            totBlocks += nblocks
            totEvents += nevts
            totSize += size
            totCpuT += cput
            sites = json.dumps(sorted(list(nodes)))
            njobs = requestJobs[wname]
            print("\n### %s" % wname)
            print("%s datasets, %s blocks, %s bytes (%s TB), %s nevts, %s nlumis, cput %s, copies %s, %s"
                  % (ndatasets, nblocks, size, teraBytes(size), nevts, nlumis, cput, ncopies, sites))

            # find out which site can serve given workflow request
            t0 = time.time()
            lheInput, primary, parent, secondary, allowedSites \
                = getSiteWhiteList(wspec, siteInfo, reqmgrAuxSvc, reqSpecs)
            rdict = dict(name=wname, datasets=datasets, blocks=datasetBlocks,
                         npileups=npileups, size=size, njobs=njobs,
                         nevents=nevts, nlumis=nlumis, cput=cput, ncopies=ncopies,
                         sites=sites, allowedSites=allowedSites, parent=parent,
                         lheInput=lheInput, primary=primary, secondary=secondary)
            requests[wname] = rdict
            print("sites", allowedSites)
            elapsedTime(t0, "getSiteWhiteList")

    print("\ntotal # of workflows %s, datasets %s, blocks %s, evts %s, size %s (%s TB), cput %s (hours)"
          % (len(winfo.keys()), len(datasets), totBlocks, totEvents, totSize,
             teraBytes(totSize), totCpuT))
    elapsedTime(tst0, 'workflows info')
    elapsedTime(orig)
    return requests
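# A minimal usage sketch (not executed here): it assumes `reqmgrAuxSvc` is an
# already-initialised ReqMgr auxiliary service instance, as required by
# getSiteWhiteList above; the printed fields come from the rdict built above.
#
#     rdict = requestsInfo(reqmgrAuxSvc, state='assignment-approved')
#     for wname, rec in rdict.items():
#         print(wname, rec['njobs'], rec['ncopies'], rec['allowedSites'])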