def test_spark_show_numpy(spark: SparkSession, capsys):
    data = wrap(np.random.rand(50, 50, 3))
    data2 = wrap(np.array([1, 2, 3], dtype=np.uint8))
    df = spark.createDataFrame([{"np": data}, {"np": data2}])
    df.show()

    assert np.array_equal(data, df.first().np)

    stdout = capsys.readouterr().out
    print(stdout)
    assert "ndarray(float64" in stdout
    assert "ndarray(uint8" in stdout
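# NOTE (assumption): the `spark` argument used by these tests is a pytest
# fixture that is not defined in this excerpt. A minimal conftest.py-style
# sketch is shown below; the real fixture presumably also registers rikai's
# user-defined types and the "rikai" data source.
@pytest.fixture(scope="session")
def spark() -> SparkSession:
    # Hypothetical local SparkSession for running the tests; configuration
    # details of the actual fixture are not shown in this excerpt.
    session = (
        SparkSession.builder.master("local[2]")
        .appName("rikai-tests")
        .getOrCreate()
    )
    yield session
    session.stop()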
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = asset_dir / f"{i}.png"
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        assert len(examples) == 8
        actual.extend(examples)
    actual = sorted(actual, key=lambda x: x["id"])
    assert len(actual) == 1000
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])
def test_load_dataset(spark: SparkSession, tmp_path: Path):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = asset_dir / f"{i}.png"
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image.from_array(image_data, image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    loader = DataLoader(dataset_dir, batch_size=8)
    _check_loader(loader, expected)

    loader2 = DataLoader(df, batch_size=8)
    _check_loader(loader2, expected)
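# NOTE (assumption): `_check_loader` is not defined in this excerpt. A minimal
# sketch is given below, assuming it performs the same batched verification as
# the inline loop in the earlier version of this test (batch size 8).
def _check_loader(loader: DataLoader, expected: list) -> None:
    # Hypothetical helper: collect all batches, then compare row by row.
    actual = []
    for examples in loader:
        assert len(examples) == 8
        actual.extend(examples)
    actual = sorted(actual, key=lambda x: x["id"])
    assert len(actual) == len(expected)
    for expect, act in zip(expected, actual):
        assert np.array_equal(expect["array"], act["array"])
        assert np.array_equal(expect["image"], act["image"])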
def test_numpy_to_image(spark: SparkSession, tmp_path: Path):
    """Test uploading a numpy image to external storage and converting
    the data into an Image asset.
    """
    df = spark.createDataFrame(
        [Row(id=1, data=wrap(np.ones((32, 32), dtype=np.uint8)))]
    )
    df = df.withColumn(
        "image",
        numpy_to_image(
            df.data,
            concat(lit(str(tmp_path)), lit("/"), df.id, lit(".png")),
        ),
    )
    df.count()
    assert Path(df.first().image.uri) == tmp_path / "1.png"
    assert (tmp_path / "1.png").exists()
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()
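# NOTE (assumption): `num_workers` in the test below is not a built-in pytest
# fixture; the full test module presumably parametrizes it, e.g. with
# `@pytest.mark.parametrize("num_workers", [0, 2])`. The exact values are not
# shown in this excerpt.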
def test_torch_dataset(spark, tmp_path, num_workers):
    total = 1000
    dataset_dir = tmp_path / "data"
    asset_dir = tmp_path / "asset"
    asset_dir.mkdir(parents=True)

    data = []
    expected = []
    for i in range(total):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = asset_dir / f"{i}.png"
        # Persist the image asset so that Image(image_uri) below can read it.
        Image.from_array(image_data, image_uri)
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append(
            {
                "id": i,
                "array": torch.as_tensor(np.array([array])),
                "image": torch.as_tensor(np.array([image_data])),
            }
        )
    df = spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(str(dataset_dir))

    dataset = Dataset(dataset_dir)
    loader = torchDataLoader(
        dataset,
        num_workers=num_workers,
        drop_last=True,
    )
    actual = sorted(list(loader), key=lambda x: x["id"])
    assert len(actual) == total
    for expect, act in zip(expected, actual):
        assert torch.equal(
            expect["array"], act["array"]
        ), f"Expected {expect['array']} got {act['array']}"
        assert torch.equal(expect["image"], act["image"])
def test_load_dataset(self):
    dataset_dir = os.path.join(self.test_dir, "features")
    asset_dir = os.path.join(self.test_dir, "assets")
    os.makedirs(asset_dir)

    expected = []
    data = []
    for i in range(1000):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = os.path.join(asset_dir, f"{i}.png")
        PILImage.fromarray(image_data).save(image_uri)
        array = wrap(np.random.random_sample((3, 4)))
        data.append(
            {
                "id": i,
                "array": array,
                "image": Image(image_uri),
            }
        )
        expected.append({"id": i, "array": array, "image": image_data})
    df = self.spark.createDataFrame(data)
    df.write.mode("overwrite").format("rikai").save(dataset_dir)

    loader = DataLoader(dataset_dir, batch_size=8)
    actual = []
    for examples in loader:
        self.assertEqual(8, len(examples))
        actual.extend(examples)
    actual = sorted(actual, key=lambda x: x["id"])
    self.assertEqual(1000, len(actual))
    for expect, act in zip(expected, actual):
        self.assertTrue(np.array_equal(expect["array"], act["array"]))
        self.assertTrue(np.array_equal(expect["image"], act["image"]))