def setUp(self):
    self.id = 0
    self.factory = FormatterFactory()
    self.universe = DimensionUniverse()
    self.dataId = DataCoordinate.makeEmpty(self.universe)
    # Dummy FileDescriptor for testing getFormatter
    self.fileDescriptor = FileDescriptor(Location("/a/b/c", "d"),
                                         StorageClass("DummyStorageClass", dict, None))
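# --- Hedged usage sketch (not from the source): how the fixture above is
# typically exercised.  Assumes `Formatter` is imported from `lsst.daf.butler`
# and that registerFormatter/getFormatter accept a StorageClass plus the
# formatter constructor arguments; the JsonFormatter name is illustrative.
def testGetFormatterSketch(self):
    formatterName = "lsst.daf.butler.formatters.json.JsonFormatter"
    # Associate the dummy storage class with a concrete formatter class.
    self.factory.registerFormatter(self.fileDescriptor.storageClass, formatterName)
    # Construct a formatter for the dummy FileDescriptor and empty data ID.
    formatter = self.factory.getFormatter(self.fileDescriptor.storageClass,
                                          self.fileDescriptor, self.dataId)
    self.assertIsInstance(formatter, Formatter)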
def makeGraph(self, pipeline, collections, run, userQuery):
    """Create execution graph for a pipeline.

    Parameters
    ----------
    pipeline : `Pipeline`
        Pipeline definition, task names/classes and their configs.
    collections
        Expressions representing the collections to search for input
        datasets.  May be any of the types accepted by
        `lsst.daf.butler.CollectionSearch.fromExpression`.
    run : `str`, optional
        Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
        output datasets, if it already exists.
    userQuery : `str`
        String which defines user-defined selection for registry, should
        be empty or `None` if there are no restrictions on data selection.

    Returns
    -------
    graph : `QuantumGraph`

    Raises
    ------
    UserExpressionError
        Raised when user expression cannot be parsed.
    OutputExistsError
        Raised when output datasets already exist.
    Exception
        Other exception types may be raised by underlying registry
        classes.
    """
    scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

    instrument = pipeline.getInstrument()
    if isinstance(instrument, str):
        instrument = doImport(instrument)
    if instrument is not None:
        dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                            universe=self.registry.dimensions)
    else:
        dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
    with scaffolding.connectDataIds(self.registry, collections, userQuery,
                                    dataId) as commonDataIds:
        scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                       skipExisting=self.skipExisting)
    return scaffolding.makeQuantumGraph()
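# --- Hedged usage sketch (not from the source): driving makeGraph from the
# enclosing builder class (assumed to be pipe_base's GraphBuilder, which holds
# the `registry` and `skipExisting` attributes used above).  The repository
# path, pipeline file, collection names, run name, and query are illustrative.
def _example_make_graph():
    from lsst.daf.butler import Butler
    from lsst.pipe.base import GraphBuilder, Pipeline  # import path assumed

    butler = Butler("/repo/example")
    pipeline = Pipeline.fromFile("pipelines/demo.yaml")
    builder = GraphBuilder(registry=butler.registry, skipExisting=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],     # input collections to search
        run="u/someone/demo-run",         # output RUN collection
        userQuery="instrument = 'HSC' AND visit = 12345",
    )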
def testStandardize(self):
    """Test constructing a DataCoordinate from many different kinds of
    input via `DataCoordinate.standardize` and `DataCoordinate.subset`.
    """
    for n in range(5):
        dimensions = self.randomDimensionSubset()
        dataIds = self.randomDataIds(n=1).subset(dimensions)
        split = self.splitByStateFlags(dataIds)
        for m, dataId in enumerate(split.chain()):
            # Passing in any kind of DataCoordinate alone just returns
            # that object.
            self.assertIs(dataId, DataCoordinate.standardize(dataId))
            # Same if we also explicitly pass the dimensions we want.
            self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph))
            # Same if we pass the dimensions and some irrelevant
            # kwargs.
            self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph, htm7=12))
            # Test constructing a new data ID from this one with a
            # subset of the dimensions.
            # This is not possible for some combinations of
            # dimensions if hasFull is False (see
            # `DataCoordinate.subset` docs).
            newDimensions = self.randomDimensionSubset(n=1, graph=dataId.graph)
            if dataId.hasFull() or dataId.graph.required.issuperset(newDimensions.required):
                newDataIds = [
                    dataId.subset(newDimensions),
                    DataCoordinate.standardize(dataId, graph=newDimensions),
                    DataCoordinate.standardize(dataId, graph=newDimensions, htm7=12),
                ]
                for newDataId in newDataIds:
                    with self.subTest(newDataId=newDataId, type=type(dataId)):
                        commonKeys = dataId.keys() & newDataId.keys()
                        self.assertTrue(commonKeys)
                        self.assertEqual(
                            [newDataId[k] for k in commonKeys],
                            [dataId[k] for k in commonKeys],
                        )
                        # This should never "downgrade" from
                        # Complete to Minimal or Expanded to Complete.
                        if dataId.hasRecords():
                            self.assertTrue(newDataId.hasRecords())
                        if dataId.hasFull():
                            self.assertTrue(newDataId.hasFull())
        # Start from a complete data ID, and pass its values in via several
        # different ways that should be equivalent.
        for dataId in split.complete:
            # Split the keys (dimension names) into two random subsets, so
            # we can pass some as kwargs below.
            keys1 = set(self.rng.sample(list(dataId.graph.dimensions.names),
                                        len(dataId.graph.dimensions)//2))
            keys2 = dataId.graph.dimensions.names - keys1
            newCompleteDataIds = [
                DataCoordinate.standardize(dataId.full.byName(), universe=dataId.universe),
                DataCoordinate.standardize(dataId.full.byName(), graph=dataId.graph),
                DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                           **dataId.full.byName()),
                DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                           graph=dataId.graph, **dataId.full.byName()),
                DataCoordinate.standardize(**dataId.full.byName(), universe=dataId.universe),
                DataCoordinate.standardize(graph=dataId.graph, **dataId.full.byName()),
                DataCoordinate.standardize(
                    {k: dataId[k] for k in keys1},
                    universe=dataId.universe,
                    **{k: dataId[k] for k in keys2}
                ),
                DataCoordinate.standardize(
                    {k: dataId[k] for k in keys1},
                    graph=dataId.graph,
                    **{k: dataId[k] for k in keys2}
                ),
            ]
            for newDataId in newCompleteDataIds:
                with self.subTest(dataId=dataId, newDataId=newDataId, type=type(dataId)):
                    self.assertEqual(dataId, newDataId)
                    self.assertTrue(newDataId.hasFull())
def setUp(self):
    config = Config(
        {
            "version": 1,
            "namespace": "pipe_base_test",
            "skypix": {
                "common": "htm7",
                "htm": {
                    "class": "lsst.sphgeom.HtmPixelization",
                    "max_level": 24,
                },
            },
            "elements": {
                "A": {
                    "keys": [
                        {
                            "name": "id",
                            "type": "int",
                        }
                    ],
                    "storage": {
                        "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                    },
                },
                "B": {
                    "keys": [
                        {
                            "name": "id",
                            "type": "int",
                        }
                    ],
                    "storage": {
                        "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                    },
                },
            },
            "packers": {},
        }
    )
    universe = DimensionUniverse(config=config)
    # need to make a mapping of TaskDef to set of quantum
    quantumMap = {}
    tasks = []
    for task, label in (
        (Dummy1PipelineTask, "R"),
        (Dummy2PipelineTask, "S"),
        (Dummy3PipelineTask, "T"),
        (Dummy4PipelineTask, "U"),
    ):
        config = task.ConfigClass()
        taskDef = TaskDef(get_full_type_name(task), config, task, label)
        tasks.append(taskDef)
        quantumSet = set()
        connections = taskDef.connections
        for a, b in ((1, 2), (3, 4)):
            if connections.initInputs:
                initInputDSType = DatasetType(
                    connections.initInput.name,
                    tuple(),
                    storageClass=connections.initInput.storageClass,
                    universe=universe,
                )
                initRefs = [DatasetRef(initInputDSType, DataCoordinate.makeEmpty(universe))]
            else:
                initRefs = None
            inputDSType = DatasetType(
                connections.input.name,
                connections.input.dimensions,
                storageClass=connections.input.storageClass,
                universe=universe,
            )
            inputRefs = [
                DatasetRef(inputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
            ]
            outputDSType = DatasetType(
                connections.output.name,
                connections.output.dimensions,
                storageClass=connections.output.storageClass,
                universe=universe,
            )
            outputRefs = [
                DatasetRef(outputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
            ]
            quantumSet.add(
                Quantum(
                    taskName=task.__qualname__,
                    dataId=DataCoordinate.standardize({"A": a, "B": b}, universe=universe),
                    taskClass=task,
                    initInputs=initRefs,
                    inputs={inputDSType: inputRefs},
                    outputs={outputDSType: outputRefs},
                )
            )
        quantumMap[taskDef] = quantumSet
    self.tasks = tasks
    self.quantumMap = quantumMap
    self.qGraph = QuantumGraph(quantumMap, metadata=METADATA)
    self.universe = universe
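# --- Hedged sketch (not from the source): round-tripping the graph built in
# setUp above through its serialization API.  Assumes QuantumGraph.save(file)
# and QuantumGraph.load(file, universe) as in pipe_base, and that `io` is
# imported at module scope.
def testSaveLoadSketch(self):
    with io.BytesIO() as buffer:
        self.qGraph.save(buffer)
        buffer.seek(0)
        restored = QuantumGraph.load(buffer, self.universe)
    self.assertIsInstance(restored, QuantumGraph)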
def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below. This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references; if a ref has an id, it
                # already exists and should be exported, otherwise it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)

    return exports, inserts
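# --- Hedged sketch (not from the source): one way the two return values might
# be consumed when seeding an execution registry.  `source_butler`,
# `execution_butler`, the run name, and the export file name are illustrative;
# export()/saveDatasets(), registerDatasetType(), and insertDatasets() are the
# lsst.daf.butler calls assumed here.
exports, inserts = _accumulate(graph, dataset_types)

# Existing, resolved datasets: write them out with the source butler's export
# machinery so they can be imported into the execution registry later.
with source_butler.export(filename="execution_exports.yaml") as export:
    export.saveDatasets(exports)

# Expected future outputs: register their dataset types and pre-insert
# unresolved dataset entries in the output RUN collection.
for dataset_type, data_ids in inserts.items():
    execution_butler.registry.registerDatasetType(dataset_type)
    execution_butler.registry.insertDatasets(dataset_type, data_ids, run="u/someone/demo-run")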
@contextmanager
def connectDataIds(self, registry, collections, userQuery, externalDataId):
    """Query for the data IDs that connect nodes in the `QuantumGraph`.

    This method populates `_TaskScaffolding.dataIds` and
    `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

    Parameters
    ----------
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID queries.
    collections
        Expressions representing the collections to search for input
        datasets.  May be any of the types accepted by
        `lsst.daf.butler.CollectionSearch.fromExpression`.
    userQuery : `str` or `None`
        User-provided expression to limit the data IDs processed.
    externalDataId : `DataCoordinate`
        Externally-provided data ID that should be used to restrict the
        results, just as if these constraints had been included via
        ``AND`` in ``userQuery``.  This includes (at least) any instrument
        named in the pipeline definition.

    Returns
    -------
    commonDataIds : \
            `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
        An interface to a database temporary table containing all data IDs
        that will appear in this `QuantumGraph`.  Returned inside a context
        manager, which will drop the temporary table at the end of the
        `with` block in which this method is called.
    """
    _LOG.debug("Building query for data IDs.")
    # Initialization datasets always have empty data IDs.
    emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
    for datasetType, refs in itertools.chain(self.initInputs.items(),
                                             self.initIntermediates.items(),
                                             self.initOutputs.items()):
        refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
    # Run one big query for the data IDs for task dimensions and regular
    # inputs and outputs.  We limit the query to only dimensions that are
    # associated with the input dataset types, but don't (yet) try to
    # obtain the dataset_ids for those inputs.
    _LOG.debug("Submitting data ID query and materializing results.")
    with registry.queryDataIds(
        self.dimensions,
        datasets=list(self.inputs),
        collections=collections,
        where=userQuery,
        dataId=externalDataId,
    ).materialize() as commonDataIds:
        _LOG.debug("Expanding data IDs.")
        commonDataIds = commonDataIds.expanded()
        _LOG.debug("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.
        n = 0
        for n, commonDataId in enumerate(commonDataIds):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            # We remember both those that already existed and those that we
            # create now.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this
            # result row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, we can
                # now associate the DatasetRefs for this row with it.  The
                # fact that a Quantum data ID and a dataset data ID both
                # came from the same result row is what tells us they
                # should be associated.
                # Many of these associations will be duplicates (because
                # another query row that differed from this one only in
                # irrelevant dimensions already added them), and we use
                # sets to skip them.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        _LOG.debug("Finished processing %d rows from data ID query.", n)
        yield commonDataIds
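# --- Hedged sketch (not from the source): the underlying registry API used by
# connectDataIds.  materialize() keeps the query results in a temporary table
# for the duration of the context, and expanded() attaches dimension records
# to each data ID.  The dimension names, dataset type, collection, and query
# string are illustrative.
with registry.queryDataIds(
    ["visit", "detector"],
    datasets=["raw"],
    collections=["HSC/raw/all"],
    where="visit = 12345",
).materialize() as dataIds:
    dataIds = dataIds.expanded()
    for dataId in dataIds:
        print(dataId)
# The temporary table is dropped here; dataIds must not be used after the block.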