def __init__(self, config, registry, butlerRoot=None):
    """Construct the datastore, creating the root directory on demand.

    Delegates configuration handling to the base class, then ensures the
    configured root exists on disk, creating it only when configuration
    explicitly allows it via the "create" key.
    """
    super().__init__(config, registry, butlerRoot)
    if os.path.isdir(self.root):
        return
    # Only create the directory if the configuration opted in.
    allowCreate = "create" in self.config and self.config["create"]
    if not allowCreate:
        raise ValueError(f"No valid root at: {self.root}")
    safeMakeDir(self.root)
def __init__(self, config, registry):
    """Initialise the datastore from its configuration.

    Validates (and optionally creates) the root directory, then builds
    the location/formatter/storage-class machinery, the file naming
    templates, and the record table mapping dataset_id to file info.
    """
    super().__init__(config, registry)

    # Root directory must exist unless configuration allows creation.
    self.root = self.config["root"]
    if not os.path.isdir(self.root):
        if "create" not in self.config or not self.config["create"]:
            raise ValueError("No valid root at: {0}".format(self.root))
        safeMakeDir(self.root)

    # Supporting factories.
    self.locationFactory = LocationFactory(self.root)
    self.formatterFactory = FormatterFactory()
    self.storageClassFactory = StorageClassFactory()

    # Associate each configured formatter with its storage class.
    for storageClassName, formatterSpec in self.config["formatters"].items():
        self.formatterFactory.registerFormatter(storageClassName, formatterSpec)

    # File naming templates.
    self.templates = FileTemplates(self.config["templates"])

    # Name ourselves after the root location.
    self.name = "POSIXDatastore@{}".format(self.root)

    # Per-dataset record storage, keyed by dataset_id.
    columnTypes = {
        "path": str,
        "formatter": str,
        "storage_class": str,
        "dataset_id": int,
    }
    self.records = DatabaseDict.fromConfig(self.config["records"],
                                           types=columnTypes,
                                           value=self.RecordTuple,
                                           key="dataset_id",
                                           registry=registry)
def __init__(self, config, registry, butlerRoot=None):
    """Initialise the datastore from its configuration.

    Validates the configured root (optionally creating the directory),
    then builds the location/formatter/storage-class machinery, file
    naming templates, acceptance constraints, and the record table used
    to map dataset_id to stored-file information.
    """
    super().__init__(config, registry)
    if "root" not in self.config:
        raise ValueError("No root directory specified in configuration")

    # Name ourselves either using an explicit name or a name derived
    # from the (unexpanded) root.
    self.name = (self.config["name"] if "name" in self.config
                 else "POSIXDatastore@{}".format(self.config["root"]))

    # Support repository relocation in config.
    self.root = replaceRoot(self.config["root"], butlerRoot)
    if not os.path.isdir(self.root):
        if "create" not in self.config or not self.config["create"]:
            raise ValueError(f"No valid root at: {self.root}")
        safeMakeDir(self.root)

    # Supporting factories.
    self.locationFactory = LocationFactory(self.root)
    self.formatterFactory = FormatterFactory()
    self.storageClassFactory = StorageClassFactory()

    # The registry dimension universe is needed by several helpers below.
    universe = self.registry.dimensions

    # Associate formatters with storage classes.
    self.formatterFactory.registerFormatters(self.config["formatters"],
                                             universe=universe)

    # File naming templates.
    self.templates = FileTemplates(self.config["templates"],
                                   universe=universe)

    # Constraints on which datasets this datastore will accept.
    self.constraints = Constraints(self.config.get("constraints"),
                                   universe=universe)

    # Storage of paths and formatters, keyed by dataset_id.
    columnTypes = {
        "path": str,
        "formatter": str,
        "storage_class": str,
        "file_size": int,
        "checksum": str,
        "dataset_id": int,
    }
    columnLengths = {
        "path": 256,
        "formatter": 128,
        "storage_class": 64,
        "checksum": 128,
    }
    self.records = DatabaseDict.fromConfig(self.config["records"],
                                           types=columnTypes,
                                           value=self.RecordTuple,
                                           key="dataset_id",
                                           lengths=columnLengths,
                                           registry=registry)
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    ValueError
        The supplied object does not match the dataset's storage class.
    """
    datasetType = ref.datasetType
    typeName = datasetType.name
    storageClass = datasetType.storageClass

    # Reject objects that are inconsistent with the declared storage class.
    if not isinstance(inMemoryDataset, storageClass.pytype):
        raise ValueError("Inconsistency between supplied object ({}) "
                         "and storage class type ({})".format(
                             type(inMemoryDataset), storageClass.pytype))

    # Determine the output location from the naming template.
    template = self.templates.getTemplate(typeName)
    location = self.locationFactory.fromPath(template.format(ref))

    # Pick the formatter from the storage class and dataset type name.
    formatter = self.formatterFactory.getFormatter(
        datasetType.storageClass, typeName)

    # Ensure the destination directory exists.
    storageDir = os.path.dirname(location.path)
    if not os.path.isdir(storageDir):
        safeMakeDir(storageDir)

    # Serialize the dataset to disk.
    path = formatter.write(
        inMemoryDataset, FileDescriptor(location, storageClass=storageClass))

    # Record checksum and size of the written file in the registry.
    fullPath = os.path.join(self.root, path)
    info = StorageInfo(self.name, self.computeChecksum(fullPath),
                       os.stat(fullPath).st_size)
    self.registry.addStorageInfo(ref, info)

    # Remember which formatter wrote the file so it can be read back.
    fileInfo = StoredFileInfo(formatter, path, storageClass)
    self.addStoredFileInfo(ref, fileInfo)

    # Components share the parent's storage and formatter information.
    for compRef in ref.components.values():
        self.registry.addStorageInfo(compRef, info)
        self.addStoredFileInfo(compRef, fileInfo)
def _extractIngestInfo(self, path: str, ref: DatasetRef, *,
                       formatter: Type[Formatter],
                       transfer: Optional[str] = None) -> StoredFileInfo:
    # Docstring inherited from FileLikeDatastore._extractIngestInfo.
    fullPath = os.path.normpath(os.path.join(self.root, path))
    if transfer is not None:
        # Work out the destination from the file template and the
        # formatter's predicted naming convention.
        template = self.templates.getTemplate(ref)
        location = self.locationFactory.fromPath(template.format(ref))
        newPath = formatter.predictPathFromLocation(location)
        newFullPath = os.path.join(self.root, newPath)
        if os.path.exists(newFullPath):
            raise FileExistsError(f"File '{newFullPath}' already exists.")
        storageDir = os.path.dirname(newFullPath)
        if not os.path.isdir(storageDir):
            # Register removal of the new directory with the transaction
            # so a rollback restores the original state.
            with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
                safeMakeDir(storageDir)
        # Each transfer mode registers its inverse operation with the
        # transaction before performing the filesystem change.
        if transfer == "move":
            with self._transaction.undoWith("move", shutil.move,
                                            newFullPath, fullPath):
                shutil.move(fullPath, newFullPath)
        elif transfer == "copy":
            with self._transaction.undoWith("copy", os.remove, newFullPath):
                shutil.copy(fullPath, newFullPath)
        elif transfer == "hardlink":
            with self._transaction.undoWith("hardlink", os.unlink,
                                            newFullPath):
                os.link(fullPath, newFullPath)
        elif transfer == "symlink":
            with self._transaction.undoWith("symlink", os.unlink,
                                            newFullPath):
                os.symlink(fullPath, newFullPath)
        else:
            raise NotImplementedError(
                "Transfer type '{}' not supported.".format(transfer))
        # From here on refer to the file at its new location.
        path = newPath
        fullPath = newFullPath
    # Checksumming is optional and controlled by configuration.
    if self.useChecksum:
        checksum = self.computeChecksum(fullPath)
    else:
        checksum = None
    stat = os.stat(fullPath)
    size = stat.st_size
    return StoredFileInfo(formatter=formatter, path=path,
                          storageClass=ref.datasetType.storageClass,
                          file_size=size, checksum=checksum)
def __init__(self, config, registry, butlerRoot=None):
    """Construct the datastore, validating the root as a file URI.

    Accepts only a plain path or a "file" scheme URI as the root; the
    root directory is created on demand when configuration allows it.
    """
    super().__init__(config, registry, butlerRoot)

    # Interpret the configured root as a URI and reject anything that
    # is not a local file location.
    rootUri = ButlerURI(self.root)
    scheme = rootUri.scheme
    if scheme and scheme != "file":
        raise ValueError(
            f"Root location must only be a file URI not {self.root}")

    # From here on work with the plain filesystem path.
    self.root = rootUri.path
    if os.path.isdir(self.root):
        return
    if "create" in self.config and self.config["create"]:
        safeMakeDir(self.root)
        return
    raise ValueError(f"No valid root at: {self.root}")
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    TypeError
        Supplied object and storage class are inconsistent.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.

    Notes
    -----
    If the datastore is configured to reject certain dataset types it
    is possible that the put will fail and raise a
    `DatasetTypeNotSupportedError`.  The main use case for this is to
    allow `ChainedDatastore` to put to multiple datastores without
    requiring that every datastore accepts the dataset.
    """
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    storageDir = os.path.dirname(location.path)
    if not os.path.isdir(storageDir):
        # Remove the new directory again if the transaction rolls back.
        with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
            safeMakeDir(storageDir)

    # Write the file, failing early if the predicted output file
    # already exists rather than clobbering it.
    predictedFullPath = os.path.join(self.root, formatter.predictPath())
    if os.path.exists(predictedFullPath):
        raise FileExistsError(
            f"Cannot write file for ref {ref} as "
            f"output file {predictedFullPath} already exists")
    with self._transaction.undoWith("write", os.remove, predictedFullPath):
        path = formatter.write(inMemoryDataset)
        assert predictedFullPath == os.path.join(self.root, path)
        log.debug("Wrote file to %s", path)

    # Register the new file with the datastore/registry bookkeeping.
    self.ingest(path, ref, formatter=formatter)
def setUp(self):
    """Create two repositories: one made in place, plus a relocated
    configuration copy that carries an explicit "root" pointing back at
    the first repository.
    """
    self.root = tempfile.mkdtemp(dir=TESTDIR)

    # Make a new repository in one place
    self.dir1 = os.path.join(self.root, "dir1")
    Butler.makeRepo(self.dir1, config=Config(self.configFile))

    # Move the yaml file to a different place and add a "root"
    self.dir2 = os.path.join(self.root, "dir2")
    safeMakeDir(self.dir2)

    originalConfigPath = os.path.join(self.dir1, "butler.yaml")
    relocatedConfigPath = os.path.join(self.dir2, "butler2.yaml")

    butlerConfig = Config(originalConfigPath)
    butlerConfig["root"] = self.dir1
    butlerConfig.dumpToFile(relocatedConfigPath)
    os.remove(originalConfigPath)

    self.tmpConfigFile = relocatedConfigPath
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    ValueError
        The supplied object does not match the dataset's storage class.
    """
    datasetType = ref.datasetType
    typeName = datasetType.name
    storageClass = datasetType.storageClass

    # Reject objects inconsistent with the declared storage class.
    if not isinstance(inMemoryDataset, storageClass.pytype):
        raise ValueError("Inconsistency between supplied object ({}) "
                         "and storage class type ({})".format(
                             type(inMemoryDataset), storageClass.pytype))

    # Destination location from the naming template.
    location = self.locationFactory.fromPath(
        self.templates.getTemplate(typeName).format(ref))

    # Formatter chosen by storage class and dataset type name.
    formatter = self.formatterFactory.getFormatter(
        datasetType.storageClass, typeName)

    # Make sure the destination directory exists.
    storageDir = os.path.dirname(location.path)
    if not os.path.isdir(storageDir):
        safeMakeDir(storageDir)

    # Serialize the dataset, then hand the new file to ingest for
    # registration.
    path = formatter.write(
        inMemoryDataset, FileDescriptor(location, storageClass=storageClass))
    self.ingest(path, ref, formatter=formatter)
def __init__(self, config, registry):
    """Initialise the datastore from its configuration.

    Validates (and optionally creates) the root directory, then sets up
    the location/formatter/storage-class machinery, the file naming
    templates, and an in-memory map from datasets to formatters.
    """
    super().__init__(config, registry)

    # Root directory must exist unless configuration allows creation.
    self.root = self.config["root"]
    if not os.path.isdir(self.root):
        if "create" not in self.config or not self.config["create"]:
            raise ValueError("No valid root at: {0}".format(self.root))
        safeMakeDir(self.root)

    # Supporting factories.
    self.locationFactory = LocationFactory(self.root)
    self.formatterFactory = FormatterFactory()
    self.storageClassFactory = StorageClassFactory()

    # Associate each configured formatter with its storage class.
    for storageClassName, formatterSpec in self.config["formatters"].items():
        self.formatterFactory.registerFormatter(storageClassName, formatterSpec)

    # File naming templates.
    self.templates = FileTemplates(self.config["templates"])

    # Name ourselves after the root location.
    self.name = "POSIXDatastore@{}".format(self.root)

    # Somewhere to temporarily store dataset to formatter maps
    self.internalRegistry = {}
def ingest(self, path, ref, formatter=None, transfer=None):
    """Add an on-disk file with the given `DatasetRef` to the store,
    possibly transferring it.

    The caller is responsible for ensuring that the given (or predicted)
    Formatter is consistent with how the file was written; `ingest` will
    in general silently ignore incorrect formatters (as it cannot
    efficiently verify their correctness), deferring errors until ``get``
    is first called on the ingested dataset.

    Parameters
    ----------
    path : `str`
        File path.  Treated as relative to the repository root if not
        absolute.
    ref : `DatasetRef`
        Reference to the associated Dataset.
    formatter : `Formatter`, optional
        Formatter that should be used to retrieve the Dataset.  If not
        provided, the formatter will be constructed according to
        Datastore configuration.  Can be the Formatter class or an
        instance.
    transfer : str (optional)
        If not None, must be one of 'move', 'copy', 'hardlink', or
        'symlink' indicating how to transfer the file.  The new
        filename and location will be determined via template
        substitution, as with ``put``.  If the file is outside the
        datastore root, it must be transferred somehow.

    Raises
    ------
    RuntimeError
        Raised if ``transfer is None`` and path is outside the
        repository root.
    FileNotFoundError
        Raised if the file at ``path`` does not exist.
    FileExistsError
        Raised if ``transfer is not None`` but a file already exists at
        the location computed from the template.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.
    """
    # Confirm that we can accept this dataset
    if not self.constraints.isAcceptable(ref):
        # Raise rather than use boolean return value.
        raise DatasetTypeNotSupportedError(
            f"Dataset {ref} has been rejected by this datastore via"
            " configuration.")

    if formatter is None:
        formatter = self.formatterFactory.getFormatterClass(ref)

    fullPath = os.path.normpath(os.path.join(self.root, path))
    if not os.path.exists(fullPath):
        raise FileNotFoundError(
            "File at '{}' does not exist; note that paths to ingest are "
            "assumed to be relative to self.root unless they are absolute."
            .format(fullPath))

    if transfer is None:
        # File stays where it is; it must already live inside the root
        # and is stored as a root-relative path.
        if os.path.isabs(path):
            absRoot = os.path.abspath(self.root)
            if os.path.commonpath([absRoot, path]) != absRoot:
                raise RuntimeError(
                    "'{}' is not inside repository root '{}'".format(
                        path, self.root))
            path = os.path.relpath(path, absRoot)
        elif path.startswith(os.path.pardir):
            raise RuntimeError(
                f"'{path}' is outside repository root '{self.root}'")
    else:
        # Work out the destination from the file template and transfer
        # the file there, registering inverse operations with the
        # transaction so a rollback restores the original state.
        template = self.templates.getTemplate(ref)
        location = self.locationFactory.fromPath(template.format(ref))
        newPath = formatter.predictPathFromLocation(location)
        newFullPath = os.path.join(self.root, newPath)
        if os.path.exists(newFullPath):
            raise FileExistsError(
                "File '{}' already exists".format(newFullPath))
        storageDir = os.path.dirname(newFullPath)
        if not os.path.isdir(storageDir):
            with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
                safeMakeDir(storageDir)
        if transfer == "move":
            with self._transaction.undoWith("move", shutil.move,
                                            newFullPath, fullPath):
                shutil.move(fullPath, newFullPath)
        elif transfer == "copy":
            with self._transaction.undoWith("copy", os.remove, newFullPath):
                shutil.copy(fullPath, newFullPath)
        elif transfer == "hardlink":
            with self._transaction.undoWith("hardlink", os.unlink,
                                            newFullPath):
                os.link(fullPath, newFullPath)
        elif transfer == "symlink":
            with self._transaction.undoWith("symlink", os.unlink,
                                            newFullPath):
                os.symlink(fullPath, newFullPath)
        else:
            raise NotImplementedError(
                "Transfer type '{}' not supported.".format(transfer))
        # From here on refer to the file at its new location.
        path = newPath
        fullPath = newFullPath

    # Create Storage information in the registry
    checksum = self.computeChecksum(fullPath)
    stat = os.stat(fullPath)
    size = stat.st_size

    # Update the registry
    self._register_dataset_file(ref, formatter, path, size, checksum)
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    TypeError
        Supplied object and storage class are inconsistent.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.

    Notes
    -----
    If the datastore is configured to reject certain dataset types it
    is possible that the put will fail and raise a
    `DatasetTypeNotSupportedError`.  The main use case for this is to
    allow `ChainedDatastore` to put to multiple datastores without
    requiring that every datastore accepts the dataset.
    """
    datasetType = ref.datasetType
    storageClass = datasetType.storageClass

    # Sanity check: the object must match the declared storage class.
    if not isinstance(inMemoryDataset, storageClass.pytype):
        raise TypeError("Inconsistency between supplied object ({}) "
                        "and storage class type ({})".format(
                            type(inMemoryDataset), storageClass.pytype))

    # Confirm that we can accept this dataset
    if not self.constraints.isAcceptable(ref):
        # Raise rather than use boolean return value.
        raise DatasetTypeNotSupportedError(
            f"Dataset {ref} has been rejected by this datastore via"
            " configuration.")

    # Work out output file name; a missing template means this dataset
    # type is not supported here.
    try:
        template = self.templates.getTemplate(ref)
    except KeyError as e:
        raise DatasetTypeNotSupportedError(
            f"Unable to find template for {ref}") from e
    location = self.locationFactory.fromPath(template.format(ref))

    # Get the formatter based on the storage class
    try:
        formatter = self.formatterFactory.getFormatter(ref)
    except KeyError as e:
        raise DatasetTypeNotSupportedError(
            f"Unable to find formatter for {ref}") from e

    storageDir = os.path.dirname(location.path)
    if not os.path.isdir(storageDir):
        # Remove the new directory again if the transaction rolls back.
        with self._transaction.undoWith("mkdir", os.rmdir, storageDir):
            safeMakeDir(storageDir)

    # Write the file, failing early if the predicted output file
    # already exists rather than clobbering it.
    predictedFullPath = os.path.join(self.root,
                                     formatter.predictPath(location))
    if os.path.exists(predictedFullPath):
        raise FileExistsError(
            f"Cannot write file for ref {ref} as "
            f"output file {predictedFullPath} already exists")
    with self._transaction.undoWith("write", os.remove, predictedFullPath):
        path = formatter.write(
            inMemoryDataset, FileDescriptor(location,
                                            storageClass=storageClass))
        assert predictedFullPath == os.path.join(self.root, path)
        log.debug("Wrote file to %s", path)

    # Register the new file with the datastore/registry bookkeeping.
    self.ingest(path, ref, formatter=formatter)
def put(self, inMemoryDataset, ref):
    """Write a InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    TypeError
        Supplied object and storage class are inconsistent.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.

    Notes
    -----
    If the datastore is configured to reject certain dataset types it
    is possible that the put will fail and raise a
    `DatasetTypeNotSupportedError`.  The main use case for this is to
    allow `ChainedDatastore` to put to multiple datastores without
    requiring that every datastore accepts the dataset.
    """
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    storageDir = os.path.dirname(location.path)
    if not os.path.isdir(storageDir):
        # Never try to remove this after creating it since there might
        # be a butler ingest process running concurrently that will
        # already think this directory exists.
        safeMakeDir(storageDir)

    # Write the file, failing early if the predicted output file
    # already exists rather than clobbering it.
    predictedFullPath = os.path.join(self.root, formatter.predictPath())
    if os.path.exists(predictedFullPath):
        raise FileExistsError(
            f"Cannot write file for ref {ref} as "
            f"output file {predictedFullPath} already exists")

    def _removeFileExists(path):
        """Remove a file and do not complain if it is not there.

        This is important since a formatter might fail before the file
        is written and we should not confuse people by writing spurious
        error messages to the log.
        """
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    # Any failure in the formatter is captured here and re-raised only
    # after the undoWith context has exited normally.
    # NOTE(review): presumably so the registered undo (file removal)
    # takes effect before the error propagates -- confirm against the
    # transaction's undoWith semantics.
    formatter_exception = None
    with self._transaction.undoWith("write", _removeFileExists,
                                    predictedFullPath):
        try:
            path = formatter.write(inMemoryDataset)
            log.debug("Wrote file to %s", path)
        except Exception as e:
            formatter_exception = e

    if formatter_exception:
        raise formatter_exception

    assert predictedFullPath == os.path.join(self.root, path)

    # Extract file metadata and register the dataset.
    info = self._extractIngestInfo(path, ref, formatter=formatter)
    self._register_datasets([(ref, info)])