def makeWriteButler(cls, args: argparse.Namespace) -> Butler:
    """Return a read-write butler initialized to write to and read from
    the collections specified by the given command-line arguments.

    Parameters
    ----------
    args : `argparse.Namespace`
        Parsed command-line arguments.  See class documentation for the
        construction parameter of the same name.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A read-write butler initialized according to the given arguments.
    """
    butler = Butler(args.butler_config, writeable=True)
    self = cls(butler.registry, args, writeable=True)
    self.check(args)
    if self.output is not None:
        chainDefinition = list(self.output.chain if self.output.exists else self.inputs)
        if args.replace_run:
            replaced = chainDefinition.pop(0)
            if args.prune_replaced == "unstore":
                # Remove datasets from datastore
                with butler.transaction():
                    refs = butler.registry.queryDatasets(..., collections=replaced)
                    butler.pruneDatasets(refs, unstore=True, run=replaced, disassociate=False)
            elif args.prune_replaced == "purge":
                # Erase the replaced collection and all of its datasets; it must be
                # removed from the output chain first.
                with butler.transaction():
                    butler.registry.setCollectionChain(self.output.name, chainDefinition)
                    butler.pruneCollection(replaced, purge=True, unstore=True)
            elif args.prune_replaced is not None:
                raise NotImplementedError(
                    f"Unsupported --prune-replaced option '{args.prune_replaced}'."
                )
        if not self.output.exists:
            butler.registry.registerCollection(self.output.name, CollectionType.CHAINED)
        if not args.extend_run:
            butler.registry.registerCollection(self.outputRun.name, CollectionType.RUN)
            chainDefinition.insert(0, self.outputRun.name)
            butler.registry.setCollectionChain(self.output.name, chainDefinition)
        _LOG.debug("Preparing butler to write to '%s' and read from '%s'=%s",
                   self.outputRun.name, self.output.name, chainDefinition)
        butler.registry.defaults = RegistryDefaults(run=self.outputRun.name,
                                                    collections=self.output.name)
    else:
        inputs = CollectionSearch.fromExpression((self.outputRun.name,) + self.inputs)
        _LOG.debug("Preparing butler to write to '%s' and read from %s.",
                   self.outputRun.name, inputs)
        butler.registry.defaults = RegistryDefaults(run=self.outputRun.name, collections=inputs)
    return butler
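# A minimal standalone sketch (not part of the factory above) of the collection
# layout that makeWriteButler builds: a CHAINED output collection whose first
# member is the fresh RUN collection, followed by the inputs.  The repo path and
# all collection names below are made up for illustration, and import locations
# may differ slightly between daf_butler versions.
from lsst.daf.butler import Butler, CollectionType
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("repo", writeable=True)
butler.registry.registerCollection("demo/inputs", CollectionType.RUN)      # stands in for existing inputs
butler.registry.registerCollection("demo/output/run1", CollectionType.RUN)
butler.registry.registerCollection("demo/output", CollectionType.CHAINED)
butler.registry.setCollectionChain("demo/output", ["demo/output/run1", "demo/inputs"])
# New datasets land in the run; reads search the chain, newest run first.
butler.registry.defaults = RegistryDefaults(run="demo/output/run1", collections="demo/output")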
def testTransaction(self):
    butler = Butler(self.tmpConfigFile, run="ingest")
    datasetTypeName = "test_metric"
    dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
    dimensionEntries = (
        ("instrument", {"instrument": "DummyCam"}),
        ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "R"}),
        ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
    )
    storageClass = self.storageClassFactory.getStorageClass("StructuredData")
    metric = makeExampleMetrics()
    dataId = {"instrument": "DummyCam", "visit": 42}
    with self.assertRaises(TransactionTestError):
        with butler.transaction():
            # Create and register a DatasetType
            datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass,
                                              butler.registry)
            # Add needed Dimensions
            for args in dimensionEntries:
                butler.registry.insertDimensionData(*args)
            # Store a dataset
            ref = butler.put(metric, datasetTypeName, dataId)
            self.assertIsInstance(ref, DatasetRef)
            # Test getDirect
            metricOut = butler.getDirect(ref)
            self.assertEqual(metric, metricOut)
            # Test get
            metricOut = butler.get(datasetTypeName, dataId)
            self.assertEqual(metric, metricOut)
            # Check we can get components
            self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
            raise TransactionTestError("This should roll back the entire transaction")
    with self.assertRaises(KeyError):
        butler.registry.getDatasetType(datasetTypeName)
    with self.assertRaises(LookupError):
        butler.registry.expandDataId(dataId)
    # Should raise KeyError for missing DatasetType
    with self.assertRaises(KeyError):
        butler.get(datasetTypeName, dataId)
    # Also check explicitly if Dataset entry is missing
    self.assertIsNone(butler.registry.find(butler.collection, datasetType, dataId))
    # Direct retrieval should not find the file in the Datastore
    with self.assertRaises(FileNotFoundError):
        butler.getDirect(ref)
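# A minimal standalone sketch of the rollback behaviour the test above exercises.
# It assumes a writeable repo at "repo" whose registry already has the "test_metric"
# dataset type and the DummyCam visit-42 dimension records, and it reuses the
# makeExampleMetrics helper from above; all of these are assumptions for illustration.
from lsst.daf.butler import Butler

butler = Butler("repo", run="ingest")
dataId = {"instrument": "DummyCam", "visit": 42}
try:
    with butler.transaction():
        butler.put(makeExampleMetrics(), "test_metric", dataId)
        raise RuntimeError("abort")  # any exception escaping the context rolls back the put
except RuntimeError:
    pass
# The put left no trace: both the registry insert and the datastore write were undone,
# so butler.get("test_metric", dataId) would now fail to find the dataset.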
def ingestSimulated(repo, locations, regex, output_run, transfer="auto", ingest_type="rawexp"):
    """Ingest simulated raw frames into the butler data repository.

    Parameters
    ----------
    repo : `str`
        URI to the repository.
    locations : `list` [`str`]
        Files to ingest and directories to search for files that match
        ``regex`` to ingest.
    regex : `str`
        Regex string used to find files in directories listed in locations.
    output_run : `str`
        Name of the RUN collection that datasets should be written to.
    transfer : `str` or `None`
        The external data transfer type, by default "auto".
    ingest_type : `str`
        Dataset type name for the ingested products.

    Raises
    ------
    Exception
        Raised if operations on configuration object fail.

    Notes
    -----
    This method inserts all datasets for an exposure within a transaction,
    guaranteeing that partial exposures are never ingested.  The exposure
    dimension record is inserted with `Registry.syncDimensionData` first (in
    its own transaction), which inserts only if a record with the same primary
    key does not already exist.  This allows different files within the same
    exposure to be ingested in different runs.
    """
    butler = Butler(repo, writeable=True)

    # make sure instrument and detector dimensions are populated
    with butler.registry.transaction():
        instrument_record = {
            "name": "simulator",
            "exposure_max": 600000,
            "detector_max": 6,
            "class_name": "spherex.instrument.SimulatorInstrument"
        }
        butler.registry.syncDimensionData("instrument", instrument_record)
        for idx in range(1, 7):
            detector_record = {
                "instrument": "simulator",
                "id": idx,
                "full_name": f"array{idx}"
            }
            butler.registry.syncDimensionData("detector", detector_record)

    dimension_universe = butler.registry.dimensions
    datasetType = DatasetType(ingest_type,
                              dimension_universe.extract(("instrument", "detector", "exposure")),
                              "SPHERExImage",
                              universe=dimension_universe)
    # idempotent dataset type registration
    butler.registry.registerDatasetType(datasetType)

    # idempotent collection registration
    run = f"{ingest_type}r" if (output_run is None) else output_run
    butler.registry.registerCollection(run, type=CollectionType.RUN)

    n_failed = 0
    files = findFileResources(locations, regex)

    # example: sim_exposure_000000_array_1.fits or
    # sim_exposure_000000_array_2_dark_current.fits
    pattern = re.compile(r"sim_exposure_(\d+)_array_(\d)[_,.]")

    # do we want to group observations?
    grp = datetime.date.today().strftime("%Y%m%d")

    datasets = []
    for file in files:
        # parse exposure and detector ids from file name
        m = pattern.search(file)
        if m is None:
            n_failed += 1
            logging.error(f"{file} does not match simulator file pattern")
            continue
        else:
            g = m.groups()
            if len(g) != 2:
                n_failed += 1
                logging.error(f"Unable to get exposure and detector from file name: {file}")
                continue
            else:
                [exposure_id, detector_id] = list(map(int, g))

        try:
            exposure_record = {
                "instrument": "simulator",
                "id": exposure_id,
                "name": f"{exposure_id:06d}",
                "group_name": f"{grp}",
                "timespan": Timespan(begin=None, end=None)
            }
            # idempotent insertion of individual dimension rows
            butler.registry.syncDimensionData("exposure", exposure_record)
        except Exception as e:
            n_failed += 1
            logging.error(f"Unable to insert exposure record for file {file}: {e}")
            continue

        dataId = DataCoordinate.standardize(instrument="simulator",
                                            detector=detector_id,
                                            exposure=exposure_id,
                                            universe=butler.registry.dimensions)
        ref = DatasetRef(datasetType, dataId=dataId)
        datasets.append(FileDataset(refs=ref, path=file, formatter=AstropyImageFormatter))

    with butler.transaction():
        butler.ingest(*datasets, transfer=transfer, run=run)
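# Hypothetical invocation of ingestSimulated; the repository URI, search directory,
# regex and run name below are placeholders chosen to match the simulator file
# naming scheme parsed above.
ingestSimulated(
    "/data/spherex/repo",
    ["/data/spherex/sims"],
    r"sim_exposure_\d+_array_\d.*\.fits$",
    output_run="rawexp/sim-ingest",
    transfer="symlink",
)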
class FormattersTests(DatasetTestHelper, lsst.utils.tests.TestCase):
    root = None
    storageClassFactory = None

    @classmethod
    def setUpClass(cls):
        """Create a new butler once only."""
        cls.storageClassFactory = StorageClassFactory()

        cls.root = tempfile.mkdtemp(dir=TESTDIR)

        data_ids = {
            "instrument": [INSTRUMENT_NAME],
            "detector": [0, 1, 2, 3, 4, 5],
            "exposure": [11, 22],
        }

        configURI = ButlerURI("resource://spherex/configs", forceDirectory=True)
        butlerConfig = Config(configURI.join("butler.yaml"))
        # in-memory db is being phased out
        # butlerConfig["registry", "db"] = 'sqlite:///:memory:'
        cls.creatorButler = makeTestRepo(cls.root, data_ids, config=butlerConfig,
                                         dimensionConfig=configURI.join("dimensions.yaml"))
        for formatter in FORMATTERS:
            datasetTypeName, storageClassName = (formatter["dataset_type"],
                                                 formatter["storage_class"])
            storageClass = cls.storageClassFactory.getStorageClass(storageClassName)
            addDatasetType(cls.creatorButler, datasetTypeName, set(data_ids), storageClass)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # make test collection
        # self.butler = makeTestCollection(self.creatorButler)
        self.collection = self._testMethodName
        self.butler = Butler(butler=self.creatorButler, run=self.collection)

    def test_putget(self):
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")
        dataid = {"exposure": 11, "detector": 0, "instrument": INSTRUMENT_NAME}
        for formatter in FORMATTERS:
            # in-memory object, representing fits
            inmemobj = formatter["reader"](fitsPath)

            # save in-memory object into butler dataset
            datasetTypeName = formatter["dataset_type"]
            self.butler.put(inmemobj, datasetTypeName, dataid)

            # get butler dataset
            retrievedobj = self.butler.get(datasetTypeName, dataid)
            self.assertIsInstance(retrievedobj, formatter["inmem_cls"])
            self.assertEqual(retrievedobj.__class__.__name__,
                             inmemobj.__class__.__name__)

    def test_ingest(self):
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")

        formatter = FORMATTERS[0]
        datasetTypeName, formatterCls = (formatter["dataset_type"],
                                         formatter["formatter_cls"])

        datasetType = self.butler.registry.getDatasetType(datasetTypeName)
        datasets = []
        for exposure in range(3, 5):
            for detector in range(6):
                # use the same fits to test ingest
                if not os.path.exists(fitsPath):
                    log.warning(
                        f"No data found for detector {detector}, exposure {exposure} @ {fitsPath}."
                    )
                    continue
                ref = DatasetRef(datasetType,
                                 dataId={
                                     "instrument": INSTRUMENT_NAME,
                                     "detector": detector,
                                     "exposure": exposure * 11
                                 })
                datasets.append(FileDataset(refs=ref, path=fitsPath, formatter=formatterCls))

        # register new collection
        # run = "rawIngestedRun"
        # self.butler.registry.registerCollection(run, type=CollectionType.RUN)

        # collection is registered as a part of setUp
        run = self.collection
        with self.butler.transaction():
            for exposure in range(3, 5):
                expid = exposure * 11
                self.butler.registry.insertDimensionData(
                    "exposure", {
                        "instrument": INSTRUMENT_NAME,
                        "id": expid,
                        "name": f"{expid}",
                        "group_name": "day1",
                        "timespan": Timespan(begin=None, end=None)
                    })
            # transfer can be 'auto', 'move', 'copy', 'hardlink', 'relsymlink'
            # or 'symlink'
            self.butler.ingest(*datasets, transfer="symlink", run=run)

        # verify that 12 files were ingested (2 exposures for each detector)
        refsSet = set(self.butler.registry.queryDatasets(datasetTypeName, collections=[run]))
        self.assertEqual(len(refsSet), 12,
                         f"Collection {run} should have 12 elements after ingest")

        # verify that data id is present
        dataid = {"exposure": 44, "detector": 5, "instrument": INSTRUMENT_NAME}
        refsList = list(self.butler.registry.queryDatasets(datasetTypeName,
                                                           collections=[run],
                                                           dataId=dataid))
        self.assertEqual(len(refsList), 1,
                         f"Collection {run} should have 1 element with {dataid}")
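        # Hypothetical follow-on (not part of the original test): a resolved ref
        # returned by queryDatasets can be read back directly with getDirect,
        # without another collection search.
        retrieved = self.butler.getDirect(refsList[0])
        self.assertIsNotNone(retrieved)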