except lsst.daf.butler.registry.MissingCollectionError:
    pass  # Already removed; nothing to do


logging.info("Preparing destination repository %s...", DEST_DIR)
_remove_refcat_run(dest_repo, DEST_RUN)
dest_repo.registry.registerCollection(DEST_RUN, CollectionType.RUN)
for src_cat, dest_cat in REFCATS.items():
    src_type = src_repo.registry.getDatasetType(src_cat)
    dest_type = _rename_dataset_type(src_type, dest_cat)
    dest_repo.registry.registerDatasetType(dest_type)
dest_repo.registry.refresh()

logging.info("Searching for refcats in %s:%s...",
             args.src_dir, args.src_collection)
query = f"htm{HTM_LEVEL} in ({','.join(id_ranges)})"
datasets = []
for src_ref in src_repo.registry.queryDatasets(REFCATS.keys(), where=query,
                                               findFirst=True):
    src_type = src_ref.datasetType
    dest_type = _rename_dataset_type(src_type, REFCATS[src_type.name])
    dest_ref = DatasetRef(dest_type, src_ref.dataId)
    datasets.append(FileDataset(path=src_repo.getURI(src_ref), refs=dest_ref))

logging.info("Copying refcats...")
dest_repo.ingest(*datasets, transfer="copy")
logging.info("%d refcat shards copied to %s:%s",
             len(datasets), DEST_DIR, DEST_RUN)
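# NOTE: The excerpt above opens mid-helper and relies on _remove_refcat_run
# and _rename_dataset_type being defined earlier in the script. Below is a
# minimal sketch of what those helpers could look like. Both bodies are
# assumptions reconstructed from the call sites: the pruneCollection call
# matches the MissingCollectionError handler shown above, but the real
# script may implement removal differently.

import lsst.daf.butler
from lsst.daf.butler import DatasetType


def _rename_dataset_type(dataset_type, name):
    # Copy every property of the source dataset type except its name.
    return DatasetType(name, dataset_type.dimensions,
                       dataset_type.storageClass)


def _remove_refcat_run(repo, run):
    # Delete the run and everything in it, tolerating a missing collection.
    try:
        repo.pruneCollection(run, purge=True, unstore=True)
    except lsst.daf.butler.registry.MissingCollectionError:
        pass  # Already removed; nothing to do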
def ingestSimulated(repo,
                    locations,
                    regex,
                    output_run,
                    transfer="auto",
                    ingest_type="rawexp"):
    """Ingest raw frames into the butler registry.

    Parameters
    ----------
    repo : `str`
        URI to the repository.
    locations : `list` [`str`]
        Files to ingest and directories to search for files that match
        ``regex`` to ingest.
    regex : `str`
        Regex string used to find files in directories listed in
        ``locations``.
    output_run : `str`
        The name of the run that datasets should be output to.
    transfer : `str` or None
        The external data transfer type, by default "auto".
    ingest_type : `str`
        Ingest product data type.

    Raises
    ------
    Exception
        Raised if operations on the configuration object fail.

    Notes
    -----
    This method inserts all datasets for an exposure within a transaction,
    guaranteeing that partial exposures are never ingested. The exposure
    dimension record is inserted with `Registry.syncDimensionData` first
    (in its own transaction), which inserts only if a record with the same
    primary key does not already exist. This allows different files within
    the same exposure to be ingested in different runs.
    """
    butler = Butler(repo, writeable=True)

    # make sure instrument and detector dimensions are populated
    with butler.registry.transaction():
        instrument_record = {
            "name": "simulator",
            "exposure_max": 600000,
            "detector_max": 6,
            "class_name": "spherex.instrument.SimulatorInstrument"
        }
        butler.registry.syncDimensionData("instrument", instrument_record)
        for idx in range(1, 7):
            detector_record = {
                "instrument": "simulator",
                "id": idx,
                "full_name": f"array{idx}"
            }
            butler.registry.syncDimensionData("detector", detector_record)

    dimension_universe = butler.registry.dimensions
    datasetType = DatasetType(ingest_type,
                              dimension_universe.extract(
                                  ("instrument", "detector", "exposure")),
                              "SPHERExImage",
                              universe=dimension_universe)
    # idempotent dataset type registration
    butler.registry.registerDatasetType(datasetType)

    # idempotent collection registration
    run = f"{ingest_type}r" if (output_run is None) else output_run
    butler.registry.registerCollection(run, type=CollectionType.RUN)

    n_failed = 0
    files = findFileResources(locations, regex)

    # example: sim_exposure_000000_array_1.fits or
    # sim_exposure_000000_array_2_dark_current.fits
    pattern = re.compile(r"sim_exposure_(\d+)_array_(\d)[_,.]")

    # do we want to group observations?
    grp = datetime.date.today().strftime("%Y%m%d")

    datasets = []
    for file in files:
        # parse exposure and detector ids from the file name
        m = pattern.search(file)
        if m is None:
            n_failed += 1
            logging.error(f"{file} does not match simulator file pattern")
            continue
        g = m.groups()
        if len(g) != 2:
            n_failed += 1
            logging.error(
                f"Unable to get exposure and detector from file name: {file}")
            continue
        exposure_id, detector_id = map(int, g)

        try:
            exposure_record = {
                "instrument": "simulator",
                "id": exposure_id,
                "name": f"{exposure_id:06d}",
                "group_name": grp,
                "timespan": Timespan(begin=None, end=None)
            }
            # idempotent insertion of individual dimension rows
            butler.registry.syncDimensionData("exposure", exposure_record)
        except Exception as e:
            n_failed += 1
            logging.error(
                f"Unable to insert exposure record for file {file}: {e}")
            continue

        dataId = DataCoordinate.standardize(
            instrument="simulator",
            detector=detector_id,
            exposure=exposure_id,
            universe=butler.registry.dimensions)
        ref = DatasetRef(datasetType, dataId=dataId)
        datasets.append(
            FileDataset(refs=ref, path=file, formatter=AstropyImageFormatter))

    # report files that were skipped (the counter was otherwise unused)
    if n_failed > 0:
        logging.warning("Failed to prepare %d file(s) for ingest", n_failed)

    with butler.transaction():
        butler.ingest(*datasets, transfer=transfer, run=run)
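# A hypothetical invocation of ingestSimulated, guarded so it only runs when
# the module is executed directly. The repository URI, search directory, and
# run name are placeholders, not real paths; the regex mirrors the simulator
# file pattern parsed above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Ingest every simulator frame found under /data/sims into the
    # "sims/20210101" run, symlinking rather than copying the files.
    ingestSimulated(
        "/data/spherex/repo",
        ["/data/sims"],
        r"sim_exposure_\d+_array_\d.*\.fits$",
        output_run="sims/20210101",
        transfer="symlink",
    )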
def testIngest(self):
    butler = Butler(self.tmpConfigFile, run="ingest")

    # Create and register a DatasetType
    dimensions = butler.registry.dimensions.extract(
        ["instrument", "visit", "detector"])
    storageClass = self.storageClassFactory.getStorageClass(
        "StructuredDataDictYaml")
    datasetTypeName = "metric"
    datasetType = self.addDatasetType(datasetTypeName, dimensions,
                                      storageClass, butler.registry)

    # Add needed Dimensions
    butler.registry.insertDimensionData("instrument",
                                        {"name": "DummyCamComp"})
    butler.registry.insertDimensionData("physical_filter", {
        "instrument": "DummyCamComp",
        "name": "d-r",
        "abstract_filter": "R"
    })
    for detector in (1, 2):
        butler.registry.insertDimensionData(
            "detector", {
                "instrument": "DummyCamComp",
                "id": detector,
                "full_name": f"detector{detector}"
            })

    butler.registry.insertDimensionData(
        "visit", {
            "instrument": "DummyCamComp",
            "id": 423,
            "name": "fourtwentythree",
            "physical_filter": "d-r"
        }, {
            "instrument": "DummyCamComp",
            "id": 424,
            "name": "fourtwentyfour",
            "physical_filter": "d-r"
        })

    formatter = doImport(
        "lsst.daf.butler.formatters.yamlFormatter.YamlFormatter")
    dataRoot = os.path.join(TESTDIR, "data", "basic")
    datasets = []
    for detector in (1, 2):
        detector_name = f"detector_{detector}"
        metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
        dataId = {
            "instrument": "DummyCamComp",
            "visit": 423,
            "detector": detector
        }
        # Create a DatasetRef for ingest
        refIn = DatasetRef(datasetType, dataId, id=None)
        datasets.append(
            FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

    butler.ingest(*datasets, transfer="copy")

    dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
    dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

    metrics1 = butler.get(datasetTypeName, dataId1)
    metrics2 = butler.get(datasetTypeName, dataId2)
    self.assertNotEqual(metrics1, metrics2)

    # Compare URIs
    uri1 = butler.getUri(datasetTypeName, dataId1)
    uri2 = butler.getUri(datasetTypeName, dataId2)
    self.assertNotEqual(uri1, uri2)

    # Now do a multi-dataset but single file ingest
    metricFile = os.path.join(dataRoot, "detectors.yaml")
    refs = []
    for detector in (1, 2):
        detector_name = f"detector_{detector}"
        dataId = {
            "instrument": "DummyCamComp",
            "visit": 424,
            "detector": detector
        }
        # Create a DatasetRef for ingest
        refs.append(DatasetRef(datasetType, dataId, id=None))

    datasets = []
    datasets.append(
        FileDataset(path=metricFile, refs=refs,
                    formatter=MultiDetectorFormatter))

    butler.ingest(*datasets, transfer="copy")

    dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
    dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

    multi1 = butler.get(datasetTypeName, dataId1)
    multi2 = butler.get(datasetTypeName, dataId2)

    self.assertEqual(multi1, metrics1)
    self.assertEqual(multi2, metrics2)

    # Compare URIs
    uri1 = butler.getUri(datasetTypeName, dataId1)
    uri2 = butler.getUri(datasetTypeName, dataId2)
    self.assertEqual(uri1, uri2)

    # Test that removing one does not break the second
    butler.remove(datasetTypeName, dataId1)
    with self.assertRaises(LookupError):
        butler.datasetExists(datasetTypeName, dataId1)
    self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
    multi2b = butler.get(datasetTypeName, dataId2)
    self.assertEqual(multi2, multi2b)
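# Illustrative helper, not part of the original test: after both ingests, a
# registry query returns one DatasetRef per ingested data ID, including the
# two refs for visit 424 that share the single detectors.yaml file. The
# function name and the default run name "ingest" simply mirror the test
# above.
def _show_ingested_refs(butler, run="ingest"):
    # List every "metric" dataset the run now contains.
    for ref in butler.registry.queryDatasets("metric", collections=[run]):
        print(ref.dataId["visit"], ref.dataId["detector"])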
class FormattersTests(DatasetTestHelper, lsst.utils.tests.TestCase):
    root = None
    storageClassFactory = None

    @classmethod
    def setUpClass(cls):
        """Create a new butler once only."""
        cls.storageClassFactory = StorageClassFactory()

        cls.root = tempfile.mkdtemp(dir=TESTDIR)

        data_ids = {
            "instrument": [INSTRUMENT_NAME],
            "detector": [0, 1, 2, 3, 4, 5],
            "exposure": [11, 22],
        }

        configURI = ButlerURI("resource://spherex/configs",
                              forceDirectory=True)
        butlerConfig = Config(configURI.join("butler.yaml"))
        # in-memory db is being phased out
        # butlerConfig["registry", "db"] = 'sqlite:///:memory:'
        cls.creatorButler = makeTestRepo(
            cls.root,
            data_ids,
            config=butlerConfig,
            dimensionConfig=configURI.join("dimensions.yaml"))
        for formatter in FORMATTERS:
            datasetTypeName, storageClassName = (formatter["dataset_type"],
                                                 formatter["storage_class"])
            storageClass = cls.storageClassFactory.getStorageClass(
                storageClassName)
            addDatasetType(cls.creatorButler, datasetTypeName, set(data_ids),
                           storageClass)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # make test collection
        # self.butler = makeTestCollection(self.creatorButler)
        self.collection = self._testMethodName
        self.butler = Butler(butler=self.creatorButler, run=self.collection)

    def test_putget(self):
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")
        dataid = {"exposure": 11, "detector": 0,
                  "instrument": INSTRUMENT_NAME}
        for formatter in FORMATTERS:
            # in-memory object, representing fits
            inmemobj = formatter["reader"](fitsPath)

            # save in-memory object into butler dataset
            datasetTypeName = formatter["dataset_type"]
            self.butler.put(inmemobj, datasetTypeName, dataid)

            # get butler dataset
            retrievedobj = self.butler.get(datasetTypeName, dataid)
            self.assertTrue(isinstance(retrievedobj, formatter["inmem_cls"]))
            # assertTrue would always pass here; the intent is an equality
            # check on the class names
            self.assertEqual(retrievedobj.__class__.__name__,
                             inmemobj.__class__.__name__)

    def test_ingest(self):
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")

        formatter = FORMATTERS[0]
        datasetTypeName, formatterCls = (formatter["dataset_type"],
                                         formatter["formatter_cls"])

        datasetType = self.butler.registry.getDatasetType(datasetTypeName)
        datasets = []
        for exposure in range(3, 5):
            for detector in range(6):
                # use the same fits to test ingest
                if not os.path.exists(fitsPath):
                    log.warning(
                        f"No data found for detector {detector}, "
                        f"exposure {exposure} @ {fitsPath}."
                    )
                    continue
                ref = DatasetRef(datasetType,
                                 dataId={
                                     "instrument": INSTRUMENT_NAME,
                                     "detector": detector,
                                     "exposure": exposure * 11
                                 })
                datasets.append(
                    FileDataset(refs=ref, path=fitsPath,
                                formatter=formatterCls))

        # register new collection
        # run = "rawIngestedRun"
        # self.butler.registry.registerCollection(run,
        #                                         type=CollectionType.RUN)

        # collection is registered as a part of setUp
        run = self.collection

        with self.butler.transaction():
            for exposure in range(3, 5):
                expid = exposure * 11
                self.butler.registry.insertDimensionData(
                    "exposure", {
                        "instrument": INSTRUMENT_NAME,
                        "id": expid,
                        "name": f"{expid}",
                        "group_name": "day1",
                        "timespan": Timespan(begin=None, end=None)
                    })
            # transfer can be 'auto', 'move', 'copy', 'hardlink',
            # 'relsymlink' or 'symlink'
            self.butler.ingest(*datasets, transfer="symlink", run=run)

        # verify that 12 files were ingested (2 exposures for each detector)
        refsSet = set(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run]))
        self.assertEqual(
            len(refsSet), 12,
            f"Collection {run} should have 12 elements after ingest")

        # verify that the expected data id is present
        dataid = {"exposure": 44, "detector": 5,
                  "instrument": INSTRUMENT_NAME}
        refsList = list(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run],
                                               dataId=dataid))
        self.assertEqual(
            len(refsList), 1,
            f"Collection {run} should have 1 element with {dataid}")
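# NOTE: The tests above index into a module-level FORMATTERS table that is
# not part of this excerpt. Reconstructed from the keys the tests access,
# each entry must look roughly like the sketch below. The concrete values
# (dataset type name, storage class, reader, in-memory class) are
# illustrative assumptions, not the package's actual table; only
# AstropyImageFormatter appears elsewhere in this code.

from astropy.io import fits

FORMATTERS = [
    {
        "dataset_type": "rawexp",          # assumed dataset type name
        "storage_class": "SPHERExImage",   # assumed storage class name
        "formatter_cls": AstropyImageFormatter,
        "inmem_cls": fits.HDUList,         # reader(path) returns this type
        "reader": fits.open,
    },
]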