def test_box2d_vectorize_iou():
    box1 = Box2d(0, 0, 20, 20)
    assert np.allclose(
        [1 / 7, 5 * 5 / (2 * 20 * 20 - 5 * 5)],
        box1.iou([Box2d(10, 10, 30, 30), Box2d(15, 15, 35, 35)]),
    )

def test_df_to_rikai(spark: SparkSession, tmp_path: Path):
    df = spark.createDataFrame(
        [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
    )
    df_to_rikai(df, str(tmp_path))

    actual_df = spark.read.format("rikai").load(str(tmp_path))
    assert_count_equal(df.collect(), actual_df.collect())

def test_box2d_iou():
    box1 = Box2d(0, 0, 20, 20)
    box2 = Box2d(10, 10, 30, 30)
    assert np.isclose(1 / 7, box1.iou(box2))
    assert isinstance(box1.iou(box2), float)

    box3 = Box2d(15, 15, 35, 35)
    assert np.isclose(5 * 5 / (2 * 20 * 20 - 5 * 5), box1.iou(box3))

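# For reference, the expected constants in the IoU tests above follow
# directly from the definition of IoU: intersection area over union area.
# A minimal sketch, independent of rikai's Box2d.iou implementation; the
# helper name _iou_reference is hypothetical:
def _iou_reference(a, b):
    """Reference IoU for two (xmin, ymin, xmax, ymax) boxes."""
    ix = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    iy = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = ix * iy
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter)


# (0, 0, 20, 20) vs (10, 10, 30, 30): inter = 10 * 10 = 100,
# union = 400 + 400 - 100 = 700, so IoU = 1 / 7.
assert np.isclose(_iou_reference((0, 0, 20, 20), (10, 10, 30, 30)), 1 / 7)
# (0, 0, 20, 20) vs (15, 15, 35, 35): inter = 5 * 5 = 25, union = 775.
assert np.isclose(_iou_reference((0, 0, 20, 20), (15, 15, 35, 35)), 25 / 775)
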
def test_bbox(self):
    df = self.spark.createDataFrame([Row(b=Box2d(1, 2, 3, 4))])
    df.write.mode("overwrite").format("rikai").save(self.test_dir)

    records = self._read_parquets(self.test_dir)
    self.assertCountEqual([{"b": Box2d(1, 2, 3, 4)}], records)

def test_bbox(spark: SparkSession, tmp_path: Path):
    test_dir = str(tmp_path)
    df = spark.createDataFrame([Row(b=Box2d(1, 2, 3, 4))])
    df.write.mode("overwrite").format("rikai").save(test_dir)

    records = _read_parquets(test_dir)
    assert_count_equal([{"b": Box2d(1, 2, 3, 4)}], records)

def test_areas(spark: SparkSession):
    """Test calculating bounding box's area."""
    df = spark.createDataFrame(
        [
            (Box2d(1, 2, 2.0, 3.0),),
            (Box2d(10, 12, 11.0, 17.0),),
        ],
        ["bbox"],
    )
    df = df.withColumn("area", area(col("bbox")))
    assert_area_equals([1.0, 5.0], df)

def test_areas(self):
    """Test calculating bounding box's area."""
    df = self.spark.createDataFrame(
        [
            (Box2d(1, 2, 1.0, 1.0),),
            (Box2d(10, 12, 1.0, 5.0),),
        ],
        ["bbox"],
    )
    df = df.withColumn("area", area(col("bbox")))
    self.assertCountEqual(
        (1.0, 5.0), df.select("area").toPandas()["area"]
    )

def test_coco_dataset(
    spark: SparkSession,
    tmp_path: Path,
):
    dataset_dir = tmp_path / "features"
    asset_dir = tmp_path / "assets"
    asset_dir.mkdir(parents=True)
    data = []
    for i in range(10):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = asset_dir / f"{i}.png"
        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image.from_array(image_data, image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )
    spark.createDataFrame(data).write.mode("overwrite").format("rikai").save(
        str(dataset_dir)
    )

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    assert isinstance(example, list)
    assert 1 == len(example)
    assert 2 == len(example[0]["annotations"])
    assert np.array_equal(
        np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
    ), f"Actual annotations: {example[0]['annotations'][0]['bbox']}"

def test_box2d_as_list():
    box = Box2d(1.0, 2.0, 3.0, 4.0)
    assert [1.0, 2.0, 3.0, 4.0] == list(box)
    img = Image.fromarray(
        np.random.randint(0, 128, size=(32, 32), dtype=np.uint8)
    )
    draw = ImageDraw.Draw(img)
    # Check that the box works with draw.
    draw.rectangle(box)
    assert isinstance(box, Sequence)

def test_coco_dataset(self):
    dataset_dir = os.path.join(self.test_dir, "features")
    asset_dir = os.path.join(self.test_dir, "assets")
    os.makedirs(asset_dir)
    data = []
    for i in range(10):
        image_data = np.random.randint(
            0, 128, size=(128, 128), dtype=np.uint8
        )
        image_uri = os.path.join(asset_dir, f"{i}.png")
        PILImage.fromarray(image_data).save(image_uri)
        data.append(
            Row(
                image_id=i,
                split="train",
                image=Image(image_uri),
                annotations=[
                    Row(
                        category_id=123,
                        category_text="car",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                    Row(
                        category_id=234,
                        category_text="dog",
                        bbox=Box2d(1, 2, 3, 4),
                    ),
                ],
            )
        )
    self.spark.createDataFrame(data).write.mode("overwrite").format(
        "rikai"
    ).save(dataset_dir)

    loader = DataLoader(dataset_dir, batch_size=1)
    example = next(iter(loader))
    self.assertTrue(isinstance(example, list))
    self.assertEqual(1, len(example))
    self.assertEqual(2, len(example[0]["annotations"]))
    self.assertTrue(
        np.array_equal(
            np.array([1, 2, 3, 4]), example[0]["annotations"][0]["bbox"]
        )
    )

def test_scale_box2d():
    box = Box2d(1.0, 2.0, 3.0, 4.0)
    for twos in [2, 2.0, np.float32(2), np.float64(2), (2, 2)]:
        assert Box2d(0.5, 1.0, 1.5, 2.0) == box / twos
        assert Box2d(2.0, 4.0, 6.0, 8.0) == box * twos

    assert Box2d(0.5, 0.5, 1.5, 1.0) == box / (2, 4)
    assert Box2d(0.5, 0.25, 1.5, 0.5) == box / (2.0, 8.0)
    assert Box2d(10.0, 15.0, 30.0, 30.0) == box * (10, 7.5)

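# As the asserts above show, the tuple form scales x and y independently:
# box / (sx, sy) divides xmin/xmax by sx and ymin/ymax by sy. That makes it
# convenient for normalizing a pixel-space box by image size. A small sketch;
# the helper name, image dimensions, and box values are illustrative:
def _normalize_box_example():
    width, height = 640, 480
    pixel_box = Box2d(64.0, 48.0, 320.0, 240.0)
    # Dividing by (width, height) maps coordinates into [0, 1].
    assert pixel_box / (width, height) == Box2d(0.1, 0.1, 0.5, 0.5)
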
def test_readme_example(spark: SparkSession):
    df = spark.createDataFrame(
        [
            {
                "id": 1,
                "mat": DenseMatrix(2, 2, range(4)),
                "image": Image("s3://foo/bar/1.png"),
                "annotations": [
                    Row(
                        label="cat",
                        mask=wrap(np.random.rand(256, 256)),
                        bbox=Box2d(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0),
                    )
                ],
            }
        ]
    )
    df.show()

def test_bbox(spark, tmp_path):
    df = spark.createDataFrame(
        [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
    )
    _check_roundtrip(spark, df, tmp_path)

def test_box2d_empty_iou():
    box1 = Box2d(0, 0, 20, 20)
    assert box1.iou([]).size == 0

def test_bbox(self):
    df = self.spark.createDataFrame(
        [Row(Box2d(1, 2, 3, 4)), Row(Box2d(23, 33, 44, 88))], ["bbox"]
    )
    self._check_roundtrip(df)

def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a COCO dataset into a Rikai dataset.

    This function expects the COCO dataset to be stored in a directory
    with the following structure:

    - dataset
        - annotations
            - captions_train2017.json
            - instances_train2017.json
            - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The root directory of the dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Returns
    -------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(
        dataset_root, "annotations", "instances_train2017.json"
    )
    val_json = os.path.join(
        dataset_root, "annotations", "instances_val2017.json"
    )

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # COCO has native dependencies, so we do not distribute it
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append(
                    {
                        "category_id": ann["category_id"],
                        "category_text": categories[ann["category_id"]]["name"],
                        "bbox": bbox,
                        "area": float(ann["area"]),
                    }
                )
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        )
                    )
                ),
                "split": split,
            }
            examples.append(example)

    schema = StructType(
        [
            StructField("image_id", LongType(), False),
            StructField(
                "annotations",
                ArrayType(
                    StructType(
                        [
                            StructField("category_id", IntegerType()),
                            StructField("category_text", StringType()),
                            StructField("area", FloatType()),
                            StructField("bbox", Box2dType()),
                        ]
                    )
                ),
                False,
            ),
            StructField("image", ImageType(), False),
            StructField("split", StringType(), False),
        ]
    )
    df = spark.createDataFrame(examples, schema=schema)
    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))
    return df
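
# A hypothetical invocation of convert() -- the paths, app name, bucket, and
# limit below are illustrative, not taken from the project:
if __name__ == "__main__":
    spark = SparkSession.builder.appName("coco-to-rikai").getOrCreate()
    df = convert(
        spark, "/data/coco", limit=100, asset_dir="s3://bucket/assets/"
    )
    # Persist in the same "rikai" format the tests above read back.
    df.write.mode("overwrite").format("rikai").save("/data/coco-rikai")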