def registerDatasetTypes(datasetTypeName, dimensions, storageClass, registry):
    """Register a parent DatasetType and one DatasetType per component.

    Parameters
    ----------
    datasetTypeName
        Name given to the parent ``DatasetType``.
    dimensions
        Dimensions passed to the parent and to every component
        ``DatasetType``.
    storageClass
        Storage class of the parent; its ``components`` mapping supplies the
        component names and the storage class used for each component.
    registry
        Object whose ``registerDatasetType`` method receives each
        ``DatasetType`` that is built here.
    """
    parentType = DatasetType(datasetTypeName, dimensions, storageClass)
    registry.registerDatasetType(parentType)

    # Components share the parent's dimensions; only the name and the
    # storage class differ.
    for componentName, componentStorageClass in storageClass.components.items():
        componentTypeName = parentType.componentTypeName(componentName)
        registry.registerDatasetType(
            DatasetType(componentTypeName, dimensions, componentStorageClass)
        )
class ParquetFormatterTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, using PosixDatastore.

    Every test runs against a freshly created butler repository located in
    a temporary directory below ``TESTDIR``; the directory is removed again
    once the test has finished.
    """

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        # Schedule removal right away: unittest does not call tearDown when
        # setUp raises, but registered cleanups still run, so a failure in
        # the repository setup below no longer leaves the directory behind.
        self.addCleanup(shutil.rmtree, self.root, ignore_errors=True)
        Butler.makeRepo(self.root)
        self.butler = Butler(self.root, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def testSingleIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a flat ``pd.Index``."""
        columns1 = pd.Index(["a", "b", "c"])
        df1 = pd.DataFrame(np.random.randn(5, 3), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testMultiIndexDataFrame(self):
        """Round-trip a DataFrame whose columns are a two-level MultiIndex."""
        columns1 = pd.MultiIndex.from_tuples(
            [
                ("g", "a"),
                ("g", "b"),
                ("g", "c"),
                ("r", "a"),
                ("r", "b"),
                ("r", "c"),
            ],
            names=["filter", "column"],
        )
        df1 = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns1)
        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read just some columns a few different ways; here the selection is
        # a dict keyed by the column-index level names.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType,
            dataId={},
            parameters={"columns": {"filter": ["r"], "column": "a"}},
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})