class TestDataset(TestCase):
    """Tests for ``Dataset`` / ``Subset``: creation, hashing, saving, chunking, ASE.

    All fixtures are generated from a fixed NumPy seed. Two tests compare
    against hard-coded hash strings, so the order and number of
    ``np.random`` calls made in ``setUp`` must not change.
    """

    @staticmethod
    def _random_structures(n):
        """Draw ``n`` random structures from the global NumPy RNG.

        The draws happen in a fixed order (atom counts, positions, atomic
        numbers, cells, ``p1``, ``p2``); the hard-coded hashes in this class
        depend on that order.

        Returns:
            Tuple ``(n_atoms, r, z, b, p1, p2)`` where ``r`` and ``z`` are
            object arrays holding one array per structure, ``b`` has shape
            ``(n, 3, 3)`` and ``p1``/``p2`` have shape ``(n,)``.
        """
        n_atoms = np.random.randint(1, high=10, size=n)
        r = np.array(
            [2 * np.random.random((na, 3)) for na in n_atoms],
            dtype=object,
        )
        z = np.array(
            [np.random.randint(1, high=10, size=na) for na in n_atoms],
            dtype=object,
        )
        b = np.random.random((n, 3, 3))
        p1 = np.random.random(n)
        p2 = np.random.random(n)

        return n_atoms, r, z, b, p1, p2

    def setUp(self):
        """Seed the RNG, create the scratch directory and build all fixture datasets."""
        np.random.seed(123)

        self.tmpdir = pathlib.Path(__file__).parent / "tmp_test_dataset"
        self.tmpdir.mkdir(exist_ok=True)

        self.n = 100
        (
            self.n_atoms,
            self.r,
            self.z,
            self.b,
            self.p1,
            self.p2,
        ) = self._random_structures(self.n)

        # Three (train, test)-like pairs of 80 random indices each.
        self.splits = np.array(
            [
                [
                    np.random.randint(0, high=self.n, size=80),
                    np.random.randint(0, high=self.n, size=80),
                ]
                for i in range(3)
            ],
            dtype=object,
        )

        # Reference dataset: geometry, cells, properties and splits.
        self.data = Dataset(
            z=self.z,
            r=self.r,
            b=self.b,
            p={"p1": self.p1, "p2": self.p2},
            name="test",
            desc="test",
            splits=self.splits,
        )

        # Same as the reference dataset, but without cells.
        self.data_nocell = Dataset(
            z=self.z,
            r=self.r,
            p={"p1": self.p1, "p2": self.p2},
            name="test",
            desc="test",
            splits=self.splits,
        )

        # Same data as the reference, different name/desc and no splits.
        self.data2 = Dataset(
            z=self.z,
            r=self.r,
            b=self.b,
            p={"p1": self.p1, "p2": self.p2},
            name="test2",
            desc="test2",
        )

        # Same geometry as the reference, but without any properties.
        self.data_nop = Dataset(
            z=self.z, r=self.r, b=self.b, p={}, name="test", desc="test"
        )

        # Roll a new, unrelated dataset (continues the same RNG stream).
        _, r, z, b, p1, p2 = self._random_structures(100)
        self.different = Dataset(
            z=z,
            r=r,
            b=b,
            p={"p1": p1, "p2": p2},
            name="test_different",
            desc="test!",
        )

    def tearDown(self):
        """Remove the scratch directory together with everything saved into it."""
        shutil.rmtree(self.tmpdir)

    def test_report(self):
        """Smoke test: accessing ``report`` must not raise."""
        # NOTE(review): presumably `report` is a property, so the bare
        # attribute access is what triggers the computation -- confirm.
        self.data.report

    def test_creation(self):
        """The constructor stores all arguments unchanged."""
        self.assertEqual(self.data.name, "test")
        self.assertEqual(self.data.desc, "test")

        np.testing.assert_array_equal(self.data.z, self.z)
        np.testing.assert_array_equal(self.data.r, self.r)
        np.testing.assert_array_equal(self.data.b, self.b)
        np.testing.assert_array_equal(self.data.p["p1"], self.p1)
        np.testing.assert_array_equal(self.data.p["p2"], self.p2)
        np.testing.assert_array_equal(self.data.splits, self.splits)

    def test_wrong_creation(self):
        """Inconsistent constructor arguments are rejected with ``AssertionError``."""
        # A property with a single entry for a dataset of many structures.
        with self.assertRaises(AssertionError):
            Dataset(z=self.data.z, r=self.data.r, p={"lol": [1]})

        # Only one entry in z, while r holds all structures.
        with self.assertRaises(AssertionError):
            Dataset(z=[np.zeros(len(self.data.r[0]))], r=self.data.r)

    def test_hash_stable(self):
        """The dataset hash is stable across restarts (pinned reference value)."""
        self.assertEqual(self.data.hash, "fd404c7fe4c285112cb7719c6913dc3b")

    def test_hash_equal(self):
        """Hashes agree for equal data and differ for different data."""
        # name, desc and splits differ between data and data2; hashes must not.
        self.assertEqual(self.data.hash, self.data2.hash)
        self.assertEqual(self.data.geom_hash, self.data2.geom_hash)

        # Dropping the properties must leave the geometry hash untouched.
        self.assertEqual(self.data.geom_hash, self.data_nop.geom_hash)

        self.assertIsNotNone(self.data.hash)
        self.assertIsNotNone(self.data.geom_hash)

        self.assertNotEqual(self.data.hash, self.different.hash)

    def test_roundtrip(self):
        """Saving and re-loading a dataset preserves hash, metadata and splits."""
        self.data.save(directory=self.tmpdir)
        data3 = load_dataset("test", other_paths=[self.tmpdir])

        self.assertEqual(self.data.hash, data3.hash)
        self.assertEqual(self.data.name, data3.name)
        self.assertEqual(self.data.desc, data3.desc)
        np.testing.assert_array_equal(data3.splits, self.splits)

    def test_subset(self):
        """A ``Subset`` picks the requested entries, round-trips and hashes stably."""
        idx = np.array([3, 1, 5, 6, 28, 32, 11], dtype=int)
        subset = Subset.from_dataset(self.data, idx=idx, name="subset")

        for i, index in enumerate(idx):
            np.testing.assert_array_equal(subset.z[i], self.data.z[index])
            np.testing.assert_array_equal(subset.b[i], self.data.b[index])
            np.testing.assert_array_equal(subset.r[i], self.data.r[index])

        self.assertEqual(subset.n, len(idx))

        # saving roundtrip test
        subset.save(directory=self.tmpdir)
        subset2 = load_dataset("subset", other_paths=[self.tmpdir])
        self.assertEqual(subset.hash, subset2.hash)

        # hash stability test
        self.assertEqual(subset.hash, "17049f531ac6f4aa091c79b06e352254")

    def test_chunking(self):
        """``in_chunks(size=30)`` splits 100 structures into 30/30/30/10."""
        expected_bounds = [(0, 30), (30, 60), (60, 90), (90, 100)]
        chunks = list(self.data.in_chunks(size=30))

        # Without this check the test would pass vacuously if fewer chunks
        # (or none at all) were produced.
        self.assertEqual(len(chunks), len(expected_bounds))

        for chunk, (start, stop) in zip(chunks, expected_bounds):
            self.assertEqual(chunk.n, stop - start)
            np.testing.assert_array_equal(chunk.b, self.data.b[start:stop])

    def test_ase(self):
        """Conversion to and from ASE ``Atoms`` preserves geometry and properties."""
        atoms = self.data.as_Atoms()

        for i, a in enumerate(atoms):
            np.testing.assert_array_equal(a.get_cell(), self.data.b[i])
            np.testing.assert_array_equal(a.get_positions(), self.data.r[i])
            np.testing.assert_array_equal(a.get_atomic_numbers(), self.data.z[i])

        dataset = Dataset.from_Atoms(atoms, p=self.data.p)
        self.assertEqual(dataset.geom_hash, self.data.geom_hash)
        for p in self.data.p.keys():
            np.testing.assert_array_equal(self.data.p[p], dataset.p[p])

        # Fixture preparation is kept outside the assertRaises blocks so that
        # only Dataset.from_Atoms itself can satisfy the expectation.
        # NOTE(review): if `copy` is a shallow copy, set_pbc also modifies
        # atoms[0]; that is harmless here because `atoms` is rebuilt below.
        atoms2 = copy(atoms)
        atoms2[0].set_pbc(False)
        with self.assertRaises(AssertionError):
            Dataset.from_Atoms(atoms2)

        atoms3 = copy(atoms)
        atoms3[0].set_pbc([False, True, False])
        with self.assertRaises(AssertionError):
            Dataset.from_Atoms(atoms3)

        # Same round trip for the dataset without cells.
        atoms = self.data_nocell.as_Atoms()

        for i, a in enumerate(atoms):
            np.testing.assert_array_equal(a.get_positions(), self.data_nocell.r[i])
            np.testing.assert_array_equal(
                a.get_atomic_numbers(), self.data_nocell.z[i]
            )

        dataset = Dataset.from_Atoms(atoms)
        self.assertEqual(dataset.geom_hash, self.data_nocell.geom_hash)
data = Dataset('kaggle', z, r, b, { 'fe': fe, 'fepa': fepa, 'fecreal': fecreal, 'n_atoms': n_atoms, 'n_sub': n_sub, 'sg': sg }, desc=desc, family='tco') # And save. Bam! data.save() # Let's now create a model building and validation split np.random.seed(2312) rest, build = cmli.twoway_split(data.n, 2000) sub1 = Subset(data, build, name='build', desc='Randomly picked subset of 2000 structures') sub2 = Subset( data, rest, name='rest', desc=