def test_parquet_images_training(self):
    from zoo.orca.learn.tf2 import Estimator
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        output_shapes = {"id": (), "image": (), "label": ()}

        def data_creator(config, batch_size):
            dataset = read_parquet("tf_dataset", input_path=path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
            dataset = dataset.shuffle(10)
            dataset = dataset.map(lambda data_dict:
                                  (data_dict["image"], data_dict["label"]))
            dataset = dataset.map(parse_data_train)
            dataset = dataset.batch(batch_size)
            return dataset

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=model_creator)
        trainer.fit(data=data_creator, epochs=1, batch_size=2)
    finally:
        shutil.rmtree(temp_dir)
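# The training test above assumes `parse_data_train` and `model_creator` are
# defined elsewhere in the test module. A minimal sketch of what they could
# look like (hypothetical helpers, not the library API): the parse function
# decodes the raw image bytes stored in the parquet "image" column, and the
# model creator builds a compiled tf.keras model on each worker.
def parse_data_train(image, label):
    # `image` is a scalar tf.string tensor holding encoded image bytes.
    image = tf.io.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [64, 64]) / 255.0  # assumed target size
    return image, label

def model_creator(config):
    import tensorflow as tf
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(8, 3, activation="relu", input_shape=(64, 64, 3)),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model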
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        dataloader = read_parquet("dataloader", input_path=path)
        cur_dl = iter(dataloader)
        while True:
            try:
                print(next(cur_dl)['label'])
            except StopIteration:
                break
    finally:
        shutil.rmtree(temp_dir)
def test_write_parquet_images(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator():
        dataset_path = os.path.join(resource_path, "cat_dog")
        for root, dirs, files in os.walk(os.path.join(dataset_path, "cats")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 1, "id": image_path}
        for root, dirs, files in os.walk(os.path.join(dataset_path, "dogs")):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"image": image_path, "label": 0, "id": image_path}

    schema = {
        "image": SchemaField(feature_type=FeatureType.IMAGE,
                             dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(4,)),
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.STRING, shape=())
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        image_path = data['id']
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)
def test_write_parquet_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    def generator(num):
        for i in range(num):
            yield {"id": i, "feature": np.zeros((10,)), "label": np.ones((4,))}

    schema = {
        "id": SchemaField(feature_type=FeatureType.SCALAR,
                          dtype=DType.INT32, shape=()),
        "feature": SchemaField(feature_type=FeatureType.NDARRAY,
                               dtype=DType.FLOAT32, shape=(10,)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(4,))
    }

    try:
        ParquetDataset.write("file://" + temp_dir, generator(100), schema)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + temp_dir)
        data = data.collect()[0]
        assert data['id'] == 0
        assert np.all(data['feature'] == np.zeros((10,), dtype=np.float32))
        assert np.all(data['label'] == np.ones((4,), dtype=np.float32))
    finally:
        shutil.rmtree(temp_dir)
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(),
                             images_schema, block_size=4)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", path=path, output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())

        num_shards, rank = 3, 1
        dataset_shard = read_parquet("tf_dataset", path=path,
                                     config={"num_shards": num_shards,
                                             "rank": rank},
                                     output_types=output_types)
        assert len(list(dataset_shard)) <= len(list(dataset)) // num_shards, \
            "len of dataset_shard should be 1/`num_shards` of the whole dataset."

        dataloader = read_parquet("dataloader", path=path)
        dataloader_shard = read_parquet("dataloader", path=path,
                                        config={"num_shards": num_shards,
                                                "rank": rank})
        cur_dl = iter(dataloader_shard)
        cur_count = 0
        while True:
            try:
                print(next(cur_dl)['label'])
                cur_count += 1
            except StopIteration:
                break
        assert cur_count == len(list(dataset_shard))
    finally:
        shutil.rmtree(temp_dir)
def test_write_mnist(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        train_image_file = os.path.join(temp_dir, "train-images")
        train_label_file = os.path.join(temp_dir, "train-labels")
        output_path = os.path.join(temp_dir, "output_dataset")
        images = np.array([[i] * 16 for i in range(20)]).reshape(
            (20, 4, 4)).astype(np.uint8)
        labels = np.array(list(range(20))).reshape((20,)).astype(np.uint8)
        _images_to_mnist_file(images, train_image_file)
        _labels_to_mnist_file(labels, train_label_file)

        write_mnist(image_file=train_image_file, label_file=train_label_file,
                    output_path=output_path)
        data, schema = ParquetDataset._read_as_dict_rdd(output_path)
        data = data.sortBy(lambda x: x['label']).collect()
        images_load = np.reshape(np.stack([d['image'] for d in data]), (-1, 4, 4))
        labels_load = np.stack([d['label'] for d in data])
        assert np.all(images_load == images)
        # Compare loaded labels against the originals (the original assertion
        # compared labels_load with itself, which is always true).
        assert np.all(labels_load == labels)
    finally:
        shutil.rmtree(temp_dir)
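# `_images_to_mnist_file` / `_labels_to_mnist_file` are module-level test
# helpers assumed above. A minimal sketch of what they could do, assuming
# the standard IDX encoding that MNIST readers parse (big-endian magic
# number, dimension sizes, then raw bytes); hypothetical reconstruction:
import struct

def _images_to_mnist_file(images, file_path):
    # IDX3 layout: magic 2051, image count, rows, cols, then pixel bytes.
    n, rows, cols = images.shape
    with open(file_path, "wb") as f:
        f.write(struct.pack(">iiii", 2051, n, rows, cols))
        f.write(images.tobytes())

def _labels_to_mnist_file(labels, file_path):
    # IDX1 layout: magic 2049, label count, then label bytes.
    with open(file_path, "wb") as f:
        f.write(struct.pack(">ii", 2049, labels.shape[0]))
        f.write(labels.tobytes())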
def test_read_parquet_images_tf_dataset(self):
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        dataset = read_parquet("tf_dataset", input_path=path,
                               output_types=output_types)
        for dt in dataset.take(1):
            print(dt.keys())
    finally:
        shutil.rmtree(temp_dir)
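# The image read/training tests rely on module-level `images_generator` and
# `images_schema` fixtures that are not shown in this section. A minimal
# sketch under the same conventions as the cat_dog tests (hypothetical; the
# real fixtures live elsewhere in the test module): each record carries an
# image file path, a float label, and a string id.
def images_generator():
    dataset_path = os.path.join(resource_path, "cat_dog")
    for label, sub_dir in [(1.0, "cats"), (0.0, "dogs")]:
        for root, dirs, files in os.walk(os.path.join(dataset_path, sub_dir)):
            for name in files:
                image_path = os.path.join(root, name)
                yield {"id": image_path, "image": image_path, "label": label}

images_schema = {
    "id": SchemaField(feature_type=FeatureType.SCALAR,
                      dtype=DType.STRING, shape=()),
    "image": SchemaField(feature_type=FeatureType.IMAGE,
                         dtype=DType.FLOAT32, shape=()),
    "label": SchemaField(feature_type=FeatureType.SCALAR,
                         dtype=DType.FLOAT32, shape=())
}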
def test_train_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        _write_ndarrays(images=np.random.randn(500, 28, 28, 1).astype(np.float32),
                        labels=np.random.randint(0, 10, (500,)).astype(np.int32),
                        output_path=temp_dir)
        dataset = ParquetDataset.read_as_tf(temp_dir)

        def preprocess(data):
            return data['image'], data['label']

        dataset = dataset.map(preprocess)

        import tensorflow as tf
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh', input_shape=(28, 28, 1),
                                   padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh', padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(500, activation='tanh'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])
        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        est = Estimator.from_keras(keras_model=model)
        est.fit(data=dataset, batch_size=100, epochs=1)
    finally:
        shutil.rmtree(temp_dir)
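# `_write_ndarrays` is another test helper assumed above. A minimal sketch:
# it wraps the arrays in a generator and writes them through
# ParquetDataset.write with a matching NDARRAY/SCALAR schema (hypothetical
# reconstruction, consistent with the simple-write test above).
def _write_ndarrays(images, labels, output_path):
    def generator():
        for image, label in zip(images, labels):
            yield {"image": image, "label": label}

    schema = {
        "image": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(28, 28, 1)),
        "label": SchemaField(feature_type=FeatureType.SCALAR,
                             dtype=DType.INT32, shape=())
    }
    ParquetDataset.write(output_path, generator(), schema)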
def test_write_from_directory(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        label_map = {"cats": 0, "dogs": 1}
        write_from_directory(os.path.join(resource_path, "cat_dog"),
                             label_map, temp_dir)
        train_xshard = ParquetDataset._read_as_xshards(temp_dir)
        data = train_xshard.collect()[0]
        image_path = data["image_id"][0]
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image'][0]
    finally:
        shutil.rmtree(temp_dir)
def test_write_voc(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()
    try:
        from zoo.orca.data import SparkXShards
        dataset_path = os.path.join(resource_path, "VOCdevkit")
        output_path = os.path.join(temp_dir, "output_dataset")
        write_voc(dataset_path, splits_names=[(2007, "trainval")],
                  output_path="file://" + output_path)
        data, schema = ParquetDataset._read_as_dict_rdd("file://" + output_path)
        data = data.collect()[0]
        image_path = data["image_id"]
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        assert image_bytes == data['image']
    finally:
        shutil.rmtree(temp_dir)