Example #1
    @classmethod
    def makeWriteButler(cls, args: argparse.Namespace) -> Butler:
        """Return a read-write butler initialized to write to and read from
        the collections specified by the given command-line arguments.

        Parameters
        ----------
        args : `argparse.Namespace`
            Parsed command-line arguments.  See class documentation for the
            construction parameter of the same name.

        Returns
        -------
        butler : `lsst.daf.butler.Butler`
            A read-write butler initialized according to the given arguments.
        """
        butler = Butler(args.butler_config, writeable=True)
        self = cls(butler.registry, args, writeable=True)
        self.check(args)
        if self.output is not None:
            chainDefinition = list(self.output.chain if self.output.exists else self.inputs)
            if args.replace_run:
                replaced = chainDefinition.pop(0)
                if args.prune_replaced == "unstore":
                    # Remove datasets from the datastore only; registry
                    # entries are kept.  Here `...` matches all dataset types.
                    with butler.transaction():
                        refs = butler.registry.queryDatasets(..., collections=replaced)
                        butler.pruneDatasets(refs, unstore=True, run=replaced, disassociate=False)
                elif args.prune_replaced == "purge":
                    # Erase the entire collection and all of its datasets; the
                    # collection must first be removed from its parent chain.
                    with butler.transaction():
                        butler.registry.setCollectionChain(self.output.name, chainDefinition)
                        butler.pruneCollection(replaced, purge=True, unstore=True)
                elif args.prune_replaced is not None:
                    raise NotImplementedError(
                        f"Unsupported --prune-replaced option '{args.prune_replaced}'."
                    )
            if not self.output.exists:
                butler.registry.registerCollection(self.output.name, CollectionType.CHAINED)
            if not args.extend_run:
                butler.registry.registerCollection(self.outputRun.name, CollectionType.RUN)
                chainDefinition.insert(0, self.outputRun.name)
                butler.registry.setCollectionChain(self.output.name, chainDefinition)
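                # Illustrative chain shape after this step (names are
                # hypothetical): output -> [newRun, previousRun, *inputs]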
            _LOG.debug("Preparing butler to write to '%s' and read from '%s'=%s",
                       self.outputRun.name, self.output.name, chainDefinition)
            butler.registry.defaults = RegistryDefaults(run=self.outputRun.name, collections=self.output.name)
        else:
            inputs = CollectionSearch.fromExpression((self.outputRun.name,) + self.inputs)
            _LOG.debug("Preparing butler to write to '%s' and read from %s.", self.outputRun.name, inputs)
            butler.registry.defaults = RegistryDefaults(run=self.outputRun.name, collections=inputs)
        return butler
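
    # Hypothetical usage sketch (not from the original source), assuming
    # ``args`` comes from the pipetask argument parser and this classmethod
    # lives on a factory class called ``_ButlerFactory`` here:
    #
    #     args = parser.parse_args()
    #     butler = _ButlerFactory.makeWriteButler(args)
    #     butler.put(obj, "someDatasetType", dataId)
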
    def testTransaction(self):
        """Test that a raised exception rolls back the entire butler
        transaction, in both registry and datastore.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter",
             {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "R"}),
            ("visit",
             {"instrument": "DummyCam", "id": 42, "name": "fortytwo",
              "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Create and register a DatasetType
                datasetType = self.addDatasetType(datasetTypeName, dimensions,
                                                  storageClass,
                                                  butler.registry)
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref,
                                         ("summary", "data", "output"), metric)
                raise TransactionTestError(
                    "This should roll back the entire transaction")

        with self.assertRaises(KeyError):
            butler.registry.getDatasetType(datasetTypeName)
        with self.assertRaises(LookupError):
            butler.registry.expandDataId(dataId)
        # Should raise KeyError for missing DatasetType
        with self.assertRaises(KeyError):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(
            butler.registry.find(butler.collection, datasetType, dataId))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError):
            butler.getDirect(ref)
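
        # In miniature, the behavior verified above (hypothetical sketch):
        #
        #     with butler.transaction():
        #         butler.put(metric, datasetTypeName, dataId)
        #         raise RuntimeError("...")   # rolls back the put
        #     butler.get(datasetTypeName, dataId)  # raises: nothing persisted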
Example #3
def ingestSimulated(repo,
                    locations,
                    regex,
                    output_run,
                    transfer="auto",
                    ingest_type="rawexp"):
    """Ingests raw frames into the butler registry

    Parameters
    ----------
    repo : `str`
        URI to the repository.
    locations : `list` [`str`]
        Files to ingest and directories to search for files that match
        ``regex`` to ingest.
    regex : `str`
        Regex string used to find files in the directories listed in
        ``locations``.
    output_run : `str` or `None`
        Name of the RUN collection that datasets should be inserted into.
        If `None`, a default name derived from ``ingest_type`` is used.
    transfer : `str` or `None`
        The external data transfer mode, by default "auto".
    ingest_type : `str`
        Dataset type name for the ingested products.

    Raises
    ------
    Exception
        Raised if operations on the configuration object fail.

    Notes
    -----
    This method inserts all datasets for an exposure within a transaction,
    guaranteeing that partial exposures are never ingested.  The exposure
    dimension record is inserted with `Registry.syncDimensionData` first
    (in its own transaction), which inserts only if a record with the same
    primary key does not already exist.  This allows different files within
    the same exposure to be ingested in different runs.
    """

    butler = Butler(repo, writeable=True)

    # make sure instrument and detector dimensions are populated
    with butler.registry.transaction():
        instrument_record = {
            "name": "simulator",
            "exposure_max": 600000,
            "detector_max": 6,
            "class_name": "spherex.instrument.SimulatorInstrument"
        }
        butler.registry.syncDimensionData("instrument", instrument_record)
        for idx in range(1, 7):
            detector_record = {
                "instrument": "simulator",
                "id": idx,
                "full_name": f"array{idx}"
            }
            butler.registry.syncDimensionData("detector", detector_record)

    dimension_universe = butler.registry.dimensions
    datasetType = DatasetType(ingest_type,
                              dimension_universe.extract(
                                  ("instrument", "detector", "exposure")),
                              "SPHERExImage",
                              universe=dimension_universe)
    # idempotent dataset type registration
    butler.registry.registerDatasetType(datasetType)

    # idempotent collection registration
    run = f"{ingest_type}r" if (output_run is None) else output_run
    butler.registry.registerCollection(run, type=CollectionType.RUN)

    n_failed = 0
    files = findFileResources(locations, regex)

    # example: sim_exposure_000000_array_1.fits or
    #   sim_exposure_000000_array_2_dark_current.fits
    pattern = re.compile(r"sim_exposure_(\d+)_array_(\d)[_,.]")
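    # e.g. pattern.search("sim_exposure_000123_array_4.fits").groups()
    #   -> ("000123", "4")   (file name is illustrative)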

    # TODO: decide how observations should be grouped; for now use
    # today's date as the exposure group name
    grp = datetime.date.today().strftime("%Y%m%d")

    datasets = []
    for file in files:
        # parse exposure and detector ids from the file name; a successful
        # match always yields exactly two groups
        m = pattern.search(file)
        if m is None:
            n_failed += 1
            logging.error(f"{file} does not match simulator file pattern")
            continue
        exposure_id, detector_id = (int(v) for v in m.groups())

        try:
            exposure_record = {
                "instrument": "simulator",
                "id": exposure_id,
                "name": f"{exposure_id:06d}",
                "group_name": f"{grp}",
                "timespan": Timespan(begin=None, end=None)
            }
            # idempotent insertion of individual dimension rows
            butler.registry.syncDimensionData("exposure", exposure_record)
        except Exception as e:
            n_failed += 1
            logging.error(
                f"Unable to insert exposure record for file {file}: {e}")
            continue

        dataId = DataCoordinate.standardize(
            instrument="simulator",
            detector=detector_id,
            exposure=exposure_id,
            universe=butler.registry.dimensions)
        ref = DatasetRef(datasetType, dataId=dataId)
        datasets.append(
            FileDataset(refs=ref, path=file, formatter=AstropyImageFormatter))

    with butler.transaction():
        butler.ingest(*datasets, transfer=transfer, run=run)
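

# Hypothetical invocation sketch (repo path, locations, and run name are
# illustrative, not from the original source):
#
#     ingestSimulated("/repo/spherex", ["/data/sims"],
#                     r"sim_exposure_.*\.fits", "sims/run1", transfer="copy")
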
class FormattersTests(DatasetTestHelper, lsst.utils.tests.TestCase):
    root = None
    storageClassFactory = None

    @classmethod
    def setUpClass(cls):
        """Create a new butler once only."""

        cls.storageClassFactory = StorageClassFactory()

        cls.root = tempfile.mkdtemp(dir=TESTDIR)

        data_ids = {
            "instrument": [INSTRUMENT_NAME],
            "detector": [0, 1, 2, 3, 4, 5],
            "exposure": [11, 22],
        }

        configURI = ButlerURI("resource://spherex/configs",
                              forceDirectory=True)
        butlerConfig = Config(configURI.join("butler.yaml"))
        # in-memory db is being phased out
        # butlerConfig["registry", "db"] = 'sqlite:///:memory:'
        cls.creatorButler = makeTestRepo(
            cls.root,
            data_ids,
            config=butlerConfig,
            dimensionConfig=configURI.join("dimensions.yaml"))
        for formatter in FORMATTERS:
            datasetTypeName, storageClassName = (formatter["dataset_type"],
                                                 formatter["storage_class"])
            storageClass = cls.storageClassFactory.getStorageClass(
                storageClassName)
            addDatasetType(cls.creatorButler, datasetTypeName, set(data_ids),
                           storageClass)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # make a per-test collection named after the test method
        # (alternative: self.butler = makeTestCollection(self.creatorButler))
        self.collection = self._testMethodName
        self.butler = Butler(butler=self.creatorButler, run=self.collection)

    def test_putget(self):
        """Test a put/get round trip for each configured formatter."""
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")
        dataid = {"exposure": 11, "detector": 0, "instrument": INSTRUMENT_NAME}
        for formatter in FORMATTERS:
            # in-memory object, representing fits
            inmemobj = formatter["reader"](fitsPath)

            # save in-memory object into butler dataset
            datasetTypeName = formatter["dataset_type"]
            self.butler.put(inmemobj, datasetTypeName, dataid)

            # get butler dataset
            retrievedobj = self.butler.get(datasetTypeName, dataid)
            self.assertIsInstance(retrievedobj, formatter["inmem_cls"])
            self.assertEqual(retrievedobj.__class__.__name__,
                             inmemobj.__class__.__name__)

    def test_ingest(self):
        """Test file ingestion, inserting the exposure dimension records
        and the datasets within a single transaction.
        """
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")

        formatter = FORMATTERS[0]
        datasetTypeName, formatterCls = (formatter["dataset_type"],
                                         formatter["formatter_cls"])

        datasetType = self.butler.registry.getDatasetType(datasetTypeName)
        datasets = []
        for exposure in range(3, 5):
            for detector in range(6):
                # use the same fits to test ingest
                if not os.path.exists(fitsPath):
                    logging.warning(
                        f"No data found for detector {detector}, "
                        f"exposure {exposure} @ {fitsPath}.")
                    continue
                ref = DatasetRef(datasetType,
                                 dataId={
                                     "instrument": INSTRUMENT_NAME,
                                     "detector": detector,
                                     "exposure": exposure * 11
                                 })
                datasets.append(
                    FileDataset(refs=ref,
                                path=fitsPath,
                                formatter=formatterCls))

        # an alternative would be to register a fresh RUN collection here:
        #   run = "rawIngestedRun"
        #   self.butler.registry.registerCollection(run, type=CollectionType.RUN)

        # the run collection was already registered as part of setUp
        run = self.collection

        with self.butler.transaction():
            for exposure in range(3, 5):
                expid = exposure * 11
                self.butler.registry.insertDimensionData(
                    "exposure", {
                        "instrument": INSTRUMENT_NAME,
                        "id": expid,
                        "name": f"{expid}",
                        "group_name": "day1",
                        "timespan": Timespan(begin=None, end=None)
                    })
            # transfer can be 'auto', 'move', 'copy', 'hardlink', 'relsymlink'
            # or 'symlink'
            self.butler.ingest(*datasets, transfer="symlink", run=run)

        # verify that 12 files were ingested (2 exposures for each detector)
        refsSet = set(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run]))
        self.assertEqual(
            len(refsSet), 12,
            f"Collection {run} should have 12 elements after ingest")

        # verify that data id is present
        dataid = {"exposure": 44, "detector": 5, "instrument": INSTRUMENT_NAME}
        refsList = list(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run],
                                               dataId=dataid))
        self.assertEqual(
            len(refsList), 1,
            f"Collection {run} should have 1 element with {dataid}")