Пример #1
0
    def test_read_parquet_images_tf_dataset(self):
        temp_dir = tempfile.mkdtemp()

        try:
            ParquetDataset.write("file://" + temp_dir,
                                 images_generator(),
                                 images_schema,
                                 block_size=4)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            dataset = read_parquet("tf_dataset",
                                   input_path=path,
                                   output_types=output_types)
            for dt in dataset.take(1):
                print(dt.keys())

            dataloader = read_parquet("dataloader", input_path=path)
            cur_dl = iter(dataloader)
            while True:
                try:
                    print(next(cur_dl)['label'])
                except StopIteration:
                    break

        finally:
            shutil.rmtree(temp_dir)
Пример #2
0
    def test_parquet_images_training(self):
        from zoo.orca.learn.tf2 import Estimator
        temp_dir = tempfile.mkdtemp()
        try:
            ParquetDataset.write("file://" + temp_dir, images_generator(),
                                 images_schema)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            output_shapes = {"id": (), "image": (), "label": ()}

            def data_creator(config, batch_size):
                dataset = read_parquet("tf_dataset",
                                       input_path=path,
                                       output_types=output_types,
                                       output_shapes=output_shapes)
                dataset = dataset.shuffle(10)
                dataset = dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
                dataset = dataset.map(parse_data_train)
                dataset = dataset.batch(batch_size)
                return dataset

            ray_ctx = RayContext.get()
            trainer = Estimator.from_keras(model_creator=model_creator)
            trainer.fit(data=data_creator, epochs=1, batch_size=2)
        finally:
            shutil.rmtree(temp_dir)
Пример #3
0
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}

        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE, dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.STRING, shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        assert image_bytes == data['image']

    finally:
        shutil.rmtree(temp_dir)
Пример #4
0
def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR, dtype=DType.INT32, shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY, dtype=DType.FLOAT32, shape=(4,))
    }

    try:

        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))

    finally:
        shutil.rmtree(temp_dir)
    def test_read_parquet_images_tf_dataset(self):
        temp_dir = tempfile.mkdtemp()

        try:
            ParquetDataset.write("file://" + temp_dir,
                                 images_generator(),
                                 images_schema,
                                 block_size=4)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            dataset = read_parquet("tf_dataset",
                                   path=path,
                                   output_types=output_types)
            for dt in dataset.take(1):
                print(dt.keys())

            num_shards, rank = 3, 1
            dataset_shard = read_parquet("tf_dataset",
                                         path=path,
                                         config={
                                             "num_shards": num_shards,
                                             "rank": rank
                                         },
                                         output_types=output_types)
            assert len(list(dataset_shard)) <= len(list(dataset)) // num_shards, \
                "len of dataset_shard should be 1/`num_shards` of the whole dataset."

            dataloader = read_parquet("dataloader", path=path)
            dataloader_shard = read_parquet("dataloader",
                                            path=path,
                                            config={
                                                "num_shards": num_shards,
                                                "rank": rank
                                            })
            cur_dl = iter(dataloader_shard)
            cur_count = 0
            while True:
                try:
                    print(next(cur_dl)['label'])
                    cur_count += 1
                except StopIteration:
                    break
            assert cur_count == len(list(dataset_shard))
        finally:
            shutil.rmtree(temp_dir)
Пример #6
0
    def test_read_parquet_images_tf_dataset(self):
        temp_dir = tempfile.mkdtemp()

        try:
            ParquetDataset.write("file://" + temp_dir, images_generator(),
                                 images_schema)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            dataset = read_parquet("tf_dataset",
                                   input_path=path,
                                   output_types=output_types)
            for dt in dataset.take(1):
                print(dt.keys())

        finally:
            shutil.rmtree(temp_dir)