def getParentDatasets(self, workflows):
    """
    Given a list of requests, find which requests need to process a parent
    dataset, and discover what the parent dataset name is.

    Workflows whose input dataset came back with a None parent are handed
    over to self._workflowRemoval(), so they get skipped from this cycle.

    :param workflows: list of workflow objects providing hasParents(),
        getDbsUrl() and getInputDataset()
    :return: dictionary with the child and the parent dataset (the parent is
        None for datasets whose lookup failed)
    """
    # group the input datasets by DBS url, such that findParent() is called
    # only once per DBS instance
    datasetByDbs = {}
    for wflow in workflows:
        if wflow.hasParents():
            datasetByDbs.setdefault(wflow.getDbsUrl(), set()).add(wflow.getInputDataset())

    parentByDset = {}
    for dbsUrl, datasets in datasetByDbs.items():
        self.logger.info("Resolving %d dataset parentage against DBS: %s", len(datasets), dbsUrl)
        # first find out what's the parent dataset name
        parentByDset.update(findParent(datasets, dbsUrl))

    # now check if any of our calls failed; if so, workflow needs to be skipped from this cycle
    # (kept as a set because it is probed once per workflow right below)
    retryDatasets = {dset for dset, parent in parentByDset.items() if parent is None}
    if retryDatasets:
        retryWorkflows = [wflow for wflow in workflows
                          if wflow.hasParents() and wflow.getInputDataset() in retryDatasets]
        # remove workflows that failed one or more of the bulk queries to the data-service
        self._workflowRemoval(workflows, retryWorkflows)
    return parentByDset
def setParentDatasets(self, wflow):
    """
    Used to resolve parent datasets for a workflow.

    Nothing is resolved unless both 'InputDataset' and 'IncludeParents' are
    set to a truthy value in the workflow.

    :param wflow: A MSRuleCleaner workflow representation
    :return: The workflow object, with 'ParentDataset' set to a single-element
        list once the parent dataset has been resolved
    :raises MSRuleCleanerResolveParentError: if no parent dataset came back
        for the input dataset
    """
    childDataset = wflow['InputDataset']
    if not (childDataset and wflow['IncludeParents']):
        return wflow

    parentByChild = findParent([childDataset], self.msConfig['dbsUrl'])
    # NOTE: If findParent() returned None then the DBS service failed to
    #       resolve the request (it is considered an ERROR outside WMCore).
    #       A child dataset missing from the response ends up here as well.
    parentDataset = parentByChild.get(childDataset)
    if parentDataset is None:
        msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName'])
        raise MSRuleCleanerResolveParentError(msg)

    # a non-None lookup means the parent is known, so there is no separate
    # "parent not found" outcome left to handle at this point
    wflow['ParentDataset'] = [parentDataset]
    msg = "Found parent %s for input dataset %s in workflow: %s "
    # log the parent dataset name itself, not the whole child -> parent mapping
    self.logger.info(msg, parentDataset, childDataset, wflow['RequestName'])
    return wflow
def test_findParent(self):
    """Check that findParent() maps the first child dataset to the expected parent."""
    expectedParent = '/SingleElectron/Run2016B-v2/RAW'
    parentByChild = findParent(self.child, self.dbsUrl)
    firstChild = self.child[0]
    self.assertEqual(parentByChild[firstChild], expectedParent)