def _decideDataDestination(self, wflow, dataIn, numNodes):
    """
    Given a global list of blocks and the campaign configuration,
    decide which blocks have to be transferred and to where.
    :param wflow: workflow object
    :param dataIn: dictionary with a summary of the data to be placed
    :param numNodes: amount of nodes/RSEs that can receive data
    :return: yield a block list, the total chunk size and a node index
    """
    # FIXME: implement multiple copies (MaxCopies > 1)
    emptyBlockList = []
    dsetName = dataIn["name"]
    dataType = dataIn["type"]

    if dataType == "primary":
        ### NOTE: primary/parent data placement is done on a block basis
        chunkSets, chunkSizes = wflow.getChunkBlocks(numNodes)
        if not chunkSets:
            self.logger.warning(" found 0 primary/parent blocks for dataset: %s, moving on...", dsetName)
            yield emptyBlockList, 0, 0
        for nodeIdx, blocksSet in enumerate(chunkSets):
            self.logger.info("Have a chunk of %d blocks (%s GB) for dataset: %s",
                             len(blocksSet), gigaBytes(chunkSizes[nodeIdx]), dsetName)
            yield blocksSet, chunkSizes[nodeIdx], nodeIdx
    elif dataType == "secondary":
        ### NOTE: data placement done on a dataset basis
        # secondary datasets are transferred as a whole, until better days...
        puSize = wflow.getSecondarySummary()[dsetName]['dsetSize']
        # randomly pick one of the PNNs to put the whole pileup dataset in
        nodeIdx = randint(0, numNodes - 1)
        self.logger.info("Have whole PU dataset: %s (%s GB)", dsetName, gigaBytes(puSize))
        yield emptyBlockList, puSize, nodeIdx
def __init__(self, detoxUrl, dataAcct, quotaFraction, **kwargs):
    """
    Executes a basic setup, including proper logging.
    :param detoxUrl: string with the detox url (to fetch the quota)
    :param dataAcct: string with either the Rucio account or PhEDEx group name
    :param quotaFraction: float point number representing the fraction of the quota
    :param kwargs: the supported keyword arguments are:
      minimumThreshold: integer value defining the minimum available space required
      useRucio: boolean flag used to decide between Rucio and PhEDEx data management
      verbose: logger verbosity
      logger: logger object
    """
    self.detoxUrl = detoxUrl
    self.dataAcct = dataAcct
    self.quotaFraction = quotaFraction
    # minimumThreshold is mandatory; let a missing key raise loudly
    self.minimumSpace = kwargs["minimumThreshold"]
    self.useRucio = kwargs.get("useRucio", False)
    # bookkeeping containers for per-node storage state
    self.nodeUsage = {}
    self.availableRSEs = set()
    self.outOfSpaceNodes = set()
    self.logger = getMSLogger(kwargs.get("verbose"), kwargs.get("logger"))
    msg = ("RSEQuotas started with parameters: dataAcct=%s, quotaFraction=%s, "
           "minimumThreshold=%s GB, useRucio=%s")
    self.logger.info(msg, dataAcct, quotaFraction,
                     gigaBytes(self.minimumSpace), self.useRucio)
def _checkPrimaryDataVolume(self, wflow, wflowPnns):
    """
    Calculate the total data volume already available in the
    restricted list of PNNs, such that we can minimize primary/
    parent data transfers.
    :param wflow: a workflow object
    :param wflowPnns: set with the allowed PNNs to receive data
    :return: set with the PNN which contains most of the data already in
    """
    self.logger.info("Checking primary data volume for: %s, allowed PNNs: %s",
                     wflow.getName(), wflowPnns)

    # accumulate, per allowed PNN, how many bytes are already resident there
    volumeByPNN = {pnn: 0 for pnn in wflowPnns}
    for methodName in ("getPrimaryBlocks", "getParentBlocks"):
        inputBlocks = getattr(wflow, methodName)()
        self.logger.info("Request %s has %d initial blocks from %s",
                         wflow.getName(), len(inputBlocks), methodName)
        for blockDict in inputBlocks.values():
            commonLocation = wflowPnns & set(self._diskPNNs(blockDict['locations']))
            for pnn in commonLocation:
                volumeByPNN[pnn] += blockDict['blockSize']

    # find the PNN(s) holding the largest resident volume (ties collected)
    maxSize = 0
    finalPNN = set()
    self.logger.info("Primary/parent data volume currently available:")
    for pnn, size in volumeByPNN.items():
        self.logger.info(" PNN: %s\t\tData volume: %s GB", pnn, gigaBytes(size))
        if size > maxSize:
            maxSize, finalPNN = size, {pnn}
        elif size == maxSize:
            finalPNN.add(pnn)
    self.logger.info("The PNN that would require less data to be transferred is: %s", finalPNN)

    if len(finalPNN) > 1:
        # magically picks one site from the tied candidates. It could pick the
        # one with highest available quota, but that might overload that site...
        # keep the return value a set object
        finalPNN = {choice(list(finalPNN))}
        self.logger.info("Randomly picked PNN: %s as final location", finalPNN)
    return finalPNN
def __init__(self, dataAcct, quotaFraction, **kwargs):
    """
    Executes a basic setup, including proper logging.
    :param dataAcct: string with the Rucio account
    :param quotaFraction: float point number representing the fraction of the quota
    :param kwargs: the supported keyword arguments are:
      minimumThreshold: integer value defining the minimum available space required
      verbose: logger verbosity
      logger: logger object
    """
    self.dataAcct = dataAcct
    self.quotaFraction = quotaFraction
    # minimumThreshold is mandatory; let a missing key raise loudly
    self.minimumSpace = kwargs["minimumThreshold"]
    # bookkeeping containers for per-node storage state
    self.nodeUsage = {}
    self.availableRSEs = set()
    self.outOfSpaceNodes = set()
    self.logger = getMSLogger(kwargs.get("verbose"), kwargs.get("logger"))
    msg = ("RSEQuotas started with parameters: dataAcct=%s, quotaFraction=%s, "
           "minimumThreshold=%s GB")
    self.logger.info(msg, dataAcct, quotaFraction, gigaBytes(self.minimumSpace))
def checkPUDataLocation(self, wflow):
    """
    Check the workflow configuration - in terms of AAA - and the secondary
    pileup distribution; and if possible remove the pileup dataset from the
    next step where data is placed.
    If workflow has XRootD/AAA enabled, data location can be outside of the
    SiteWhitelist.

    Datasets already well located are popped from the workflow's secondary
    summary, so only the ones still needing data placement remain.
    :param wflow: workflow object
    """
    pileupInput = wflow.getSecondarySummary()
    if not pileupInput:
        # nothing to be done here
        return
    wflowPnns = self._getPNNs(wflow.getSitelist())
    secondaryAAA = wflow.getReqParam("TrustPUSitelists")
    msg = "Checking secondary data location for request: {}, ".format(wflow.getName())
    msg += "TrustPUSitelists: {}, request white/black list PNNs: {}".format(secondaryAAA, wflowPnns)
    self.logger.info(msg)

    if secondaryAAA:
        # what matters is to have pileup dataset(s) available in ANY disk storage
        # NOTE: iterate over a snapshot because entries are popped inside the
        # loop (mutating the dict during iteration raises RuntimeError in py3)
        for dset, dsetDict in list(pileupInput.items()):
            datasetLocation = self._diskPNNs(dsetDict['locations'])
            msg = "it has secondary: %s, total size: %s GB, disk locations: %s"
            self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
            if datasetLocation:
                self.logger.info("secondary dataset %s already in place through AAA: %s",
                                 dset, datasetLocation)
                pileupInput.pop(dset)
            else:
                self.logger.info("secondary dataset %s not available even through AAA", dset)
    else:
        if len(pileupInput) == 1:
            # snapshot iteration: the single entry may be popped below
            for dset, dsetDict in list(pileupInput.items()):
                datasetLocation = self._diskPNNs(dsetDict['locations'])
                msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
                commonLocation = wflowPnns & set(datasetLocation)
                if commonLocation:
                    msg = "secondary dataset: %s already in place. "
                    msg += "Common locations with site white/black list is: %s"
                    self.logger.info(msg, dset, commonLocation)
                    pileupInput.pop(dset)
                    wflow.setPURSElist(commonLocation)
                else:
                    self.logger.info("secondary: %s will need data placement!!!", dset)
        elif len(pileupInput) >= 2:
            # then make sure multiple pileup datasets are available at the same location
            # Note: avoid transferring the biggest one
            largestSize = 0
            largestDset = ""
            for dset, dsetDict in pileupInput.items():
                if dsetDict['dsetSize'] > largestSize:
                    largestSize = dsetDict['dsetSize']
                    largestDset = dset
            datasetLocation = self._diskPNNs(pileupInput[largestDset]['locations'])
            msg = "it has multiple pileup datasets, the largest one is: %s,"
            msg += "total size: %s GB, current disk locations: %s"
            self.logger.info(msg, largestDset, gigaBytes(largestSize), datasetLocation)
            commonLocation = wflowPnns & set(datasetLocation)
            if commonLocation:
                self.logger.info("Largest secondary dataset %s already in place: %s",
                                 largestDset, datasetLocation)
                pileupInput.pop(largestDset)
                wflow.setPURSElist(commonLocation)
            else:
                # FIX: the %s placeholder previously had no argument
                self.logger.info("Largest secondary dataset %s not available in a common location. This is BAD!",
                                 largestDset)
            # now iterate normally through the pileup datasets
            # (snapshot iteration again, since well-located entries are popped)
            for dset, dsetDict in list(pileupInput.items()):
                datasetLocation = self._diskPNNs(dsetDict['locations'])
                msg = "it has secondary: %s, total size: %s GB, current disk locations: %s"
                self.logger.info(msg, dset, gigaBytes(dsetDict['dsetSize']), datasetLocation)
                commonLocation = wflowPnns & set(datasetLocation)
                if not commonLocation:
                    msg = "secondary dataset: %s not in any common location. Its current locations are: %s"
                    self.logger.info(msg, dset, datasetLocation)
                elif commonLocation and not wflow.getPURSElist():
                    # then it's the first pileup dataset available within the SiteWhitelist,
                    # force its common location for the workflow from now on
                    msg = "secondary dataset: %s already in place: %s, common location: %s"
                    msg += ". Forcing the whole workflow to this new common location."
                    self.logger.info(msg, dset, datasetLocation, commonLocation)
                    pileupInput.pop(dset)
                    wflow.setPURSElist(commonLocation)
                else:
                    # pileup RSE list has already been defined. Get the new common location
                    newCommonLocation = commonLocation & wflow.getPURSElist()
                    if newCommonLocation:
                        msg = "secondary dataset: %s already in place. "
                        msg += "New common locations with site white/black list is: %s"
                        self.logger.info(msg, dset, newCommonLocation)
                        pileupInput.pop(dset)
                        wflow.setPURSElist(newCommonLocation)
                    else:
                        msg = "secondary dataset: %s is currently available within the site white/black list: %s"
                        msg += " But there is no common location with the other(s) pileup datasets: %s"
                        msg += " It will need data placement!!!"
                        self.logger.info(msg, dset, commonLocation, wflow.getPURSElist())

    # check if there are remaining pileups to be placed
    # we need to figure out its location NOW!
    if wflow.getSecondarySummary() and not wflow.getPURSElist():
        pnns = self._findFinalPULocation(wflow)
        wflow.setPURSElist(pnns)
def getChunkBlocks(self, numChunks=1):
    """
    Break down the input and parent blocks by a given number of chunks
    (usually the amount of sites available for data placement).
    :param numChunks: integer representing the number of chunks to be created
    :return: it returns two lists:
      * a list of sets, where each set corresponds to a set of blocks to be
        transferred to a single location;
      * and a list of integers, which references the total size of each chunk
        in the list above (same order).
    """
    if numChunks == 1:
        thisChunk = set()
        thisChunk.update(list(self.getPrimaryBlocks()))
        thisChunkSize = sum(blockInfo['blockSize']
                            for blockInfo in viewvalues(self.getPrimaryBlocks()))
        if self.getParentDataset():
            thisChunk.update(list(self.getParentBlocks()))
            thisChunkSize += sum(blockInfo['blockSize']
                                 for blockInfo in viewvalues(self.getParentBlocks()))
        # keep same data structure as multiple chunks, so list of lists
        return [thisChunk], [thisChunkSize]

    # create a descendant list of blocks according to their sizes
    # FIX: sort explicitly on the block size; sorting on the whole block info
    # dict (operator.itemgetter(1)) raises TypeError under python3
    sortedPrimary = sorted(viewitems(self.getPrimaryBlocks()),
                           key=lambda item: item[1]['blockSize'], reverse=True)
    if not sortedPrimary:
        # FIX: avoid reducing numChunks to 0 and dividing by zero below;
        # the caller already knows how to handle an empty result
        return [], []
    if len(sortedPrimary) < numChunks:
        msg = "There are less blocks than chunks to create. "
        msg += "Reducing numChunks from %d to %d" % (numChunks, len(sortedPrimary))
        self.logger.info(msg)
        numChunks = len(sortedPrimary)
    # target size for each chunk (integer GB-level arithmetic, floor division)
    chunkSize = sum(item[1]['blockSize'] for item in sortedPrimary) // numChunks
    self.logger.info("Found %d blocks and the avg chunkSize is: %s GB",
                     len(sortedPrimary), gigaBytes(chunkSize))

    # list of sets with the block names
    blockChunks = []
    # list of integers with the total block sizes in each chunk (same order as above)
    sizeChunks = []
    for i in range(numChunks):
        thisChunk = set()
        thisChunkSize = 0
        idx = 0
        # greedily fill this chunk with the largest blocks that still fit
        while True:
            self.logger.debug("Chunk: %d and idx: %s and length: %s", i, idx, len(sortedPrimary))
            if not sortedPrimary or idx >= len(sortedPrimary):
                # then all blocks have been distributed
                break
            elif not thisChunkSize:
                # then this site/chunk is empty, assign a block to it
                thisChunk.add(sortedPrimary[idx][0])
                thisChunkSize += sortedPrimary[idx][1]['blockSize']
                sortedPrimary.pop(idx)
            elif thisChunkSize + sortedPrimary[idx][1]['blockSize'] <= chunkSize:
                thisChunk.add(sortedPrimary[idx][0])
                thisChunkSize += sortedPrimary[idx][1]['blockSize']
                sortedPrimary.pop(idx)
            else:
                idx += 1
        if thisChunk:
            blockChunks.append(thisChunk)
            sizeChunks.append(thisChunkSize)

    # now take care of the leftovers... in a round-robin style....
    while sortedPrimary:
        for chunkNum in range(numChunks):
            blockChunks[chunkNum].add(sortedPrimary[0][0])
            sizeChunks[chunkNum] += sortedPrimary[0][1]['blockSize']
            sortedPrimary.pop(0)
            if not sortedPrimary:
                break
    self.logger.info("Created %d primary data chunks out of %d chunks",
                     len(blockChunks), numChunks)
    self.logger.info(" with chunk size distribution: %s", sizeChunks)

    if not self.getParentDataset():
        return blockChunks, sizeChunks

    # now add the parent blocks, considering that input blocks were evenly
    # distributed, I'd expect the same to automatically happen to the parents...
    childParent = self.getChildToParentBlocks()
    parentsSize = self.getParentBlocks()
    for chunkNum in range(numChunks):
        parentSet = set()
        for child in blockChunks[chunkNum]:
            parentSet.update(childParent[child])
        # now with the final list of parents in hand, update the list
        # of blocks within the chunk and update the chunk size as well
        blockChunks[chunkNum].update(parentSet)
        for parent in parentSet:
            sizeChunks[chunkNum] += parentsSize[parent]['blockSize']
    self.logger.info("Created %d primary+parent data chunks out of %d chunks",
                     len(blockChunks), numChunks)
    self.logger.info(" with chunk size distribution: %s", sizeChunks)
    return blockChunks, sizeChunks