    def test_s2s_forecaster_xshard_input(self):
        train_data, val_data, test_data = create_data()
        print("original", train_data[0].dtype)
        init_orca_context(cores=4, memory="2g")
        from bigdl.orca.data import XShards

        def transform_to_dict(data):
            return {'x': data[0], 'y': data[1]}

        def transform_to_dict_x(data):
            return {'x': data[0]}

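        # Partition the in-memory numpy tuples into SparkXShards and convert
        # each shard into the dict format expected by the forecaster.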
        train_data = XShards.partition(train_data).transform_shard(
            transform_to_dict)
        val_data = XShards.partition(val_data).transform_shard(
            transform_to_dict)
        test_data = XShards.partition(test_data).transform_shard(
            transform_to_dict_x)
        for distributed in [True, False]:
            forecaster = Seq2SeqForecaster(past_seq_len=24,
                                           future_seq_len=5,
                                           input_feature_num=1,
                                           output_feature_num=1,
                                           loss="mae",
                                           lr=0.01,
                                           distributed=distributed)
            forecaster.fit(train_data, epochs=2)
            distributed_pred = forecaster.predict(test_data)
            distributed_eval = forecaster.evaluate(val_data)
        stop_orca_context()
Example #2
    def test_partition_ndarray_with_num_shards_specification(self):
        data = np.random.randn(10, 4)
        # Reasonable number of shards
        xshards = XShards.partition(data, num_shards=2)

        data_parts = xshards.rdd.collect()

        reconstructed = np.concatenate(data_parts)
        assert np.allclose(data, reconstructed)
        # Requesting more shards than rows would leave some shards empty, so a ValueError is expected
        with pytest.raises(ValueError) as errorInfo:
            xshards = XShards.partition(data, num_shards=20)

        assert errorInfo.type == ValueError
        assert "number of shards" in str(errorInfo.value)
Example #3
    def test_predict_xshards(self):
        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })
        expected = train_data_shard.collect()

        expected = [shard["x"] for shard in expected]

        for x in expected:
            print(x.shape)

        expected = np.concatenate(expected)

        config = {}
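        # identity_model_creator (defined in the test module) is assumed to
        # build a model that returns its input unchanged, so the predictions
        # below should match the original "x" values.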
        trainer = Estimator.from_keras(model_creator=identity_model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        result_shards = trainer.predict(train_data_shard,
                                        batch_size=10).collect()

        result = [shard["prediction"] for shard in result_shards]
        expected_result = [shard["x"] for shard in result_shards]

        result = np.concatenate(result)

        assert np.allclose(expected, result)
Example #4
    def load(self, model_path, minPartitions=None):
        """
        Restore the model from the model file and config.

        :param model_path: the model file
        :param minPartitions: the minimum number of partitions used to load the model
        :return: the restored model
        """
        self.internal = XShards.load_pickle(model_path, minPartitions=minPartitions)
Example #5
    def test_sparkxshards_with_inbalanced_data(self):

        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })

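        # Repeat each shard's data a random number of times so that the
        # shards end up with unbalanced sizes.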
        def random_pad(data):
            import numpy as np
            import random
            times = random.randint(1, 10)
            data["x"] = np.concatenate([data["x"]] * times)
            data["y"] = np.concatenate([data["y"]] * times)
            return data

        train_data_shard = train_data_shard.transform_shard(random_pad)

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
Example #6
def np_to_xshard(x, prefix="x"):
    x = XShards.partition(x)

    def transform_to_dict(train_data):
        return {prefix: train_data}

    return x.transform_shard(transform_to_dict)
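
# A minimal usage sketch (with a hypothetical trained Orca Estimator `est`
# and input array `x_test`, neither of which is defined here), showing how
# np_to_xshard is typically paired with distributed prediction:
#
#   test_shards = np_to_xshard(x_test, prefix="x")
#   pred_shards = est.predict(test_shards)
#   predictions = np.concatenate(
#       [shard["prediction"] for shard in pred_shards.collect()])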
Example #7
    def test_partition_ndarray(self):

        data = np.random.randn(10, 4)

        xshards = XShards.partition(data)

        data_parts = xshards.rdd.collect()

        reconstructed = np.concatenate(data_parts)
        assert np.allclose(data, reconstructed)
Example #8
def get_ray_xshards():
    from bigdl.orca.data import XShards
    import numpy as np

    ndarray_dict = {"x": np.random.randn(10, 4), "y": np.random.randn(10, 4)}

    spark_xshards = XShards.partition(ndarray_dict)

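    # Convert the Spark-backed shards into Ray-backed shards so that the
    # same data can be consumed by Ray workers.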
    ray_xshards = RayXShards.from_spark_xshards(spark_xshards)

    return ray_xshards, ndarray_dict
Example #9
    def test_partition_nested_with_num_shards_specification(self):
        data1 = np.random.randn(10, 4)
        data2 = np.random.randn(10, 4)
        # Reasonable number of shards
        xshards = XShards.partition({"x": (data1, ), "y": [data2]}, num_shards=2)

        data_parts = xshards.rdd.collect()

        data1_parts = [part["x"][0] for part in data_parts]
        data2_parts = [part["y"][0] for part in data_parts]

        reconstructed1 = np.concatenate(data1_parts)
        reconstructed2 = np.concatenate(data2_parts)
        assert np.allclose(data1, reconstructed1)
        assert np.allclose(data2, reconstructed2)
        # Requesting more shards than rows would leave some shards empty, so a ValueError is expected
        with pytest.raises(ValueError) as errorInfo:
            xshards = XShards.partition({"x": (data1, ), "y": [data2]}, num_shards=20)

        assert errorInfo.type == ValueError
        assert "number of shards" in str(errorInfo.value)
Example #10
    def test_partition_nested(self):
        data1 = np.random.randn(10, 4)
        data2 = np.random.randn(10, 4)

        xshards = XShards.partition({"x": (data1, ), "y": [data2]})

        data_parts = xshards.rdd.collect()

        data1_parts = [part["x"][0] for part in data_parts]
        data2_parts = [part["y"][0] for part in data_parts]

        reconstructed1 = np.concatenate(data1_parts)
        reconstructed2 = np.concatenate(data2_parts)
        assert np.allclose(data1, reconstructed1)
        assert np.allclose(data2, reconstructed2)
Example #11
    def test_partition_list(self):
        data1 = np.random.randn(10, 4)
        data2 = np.random.randn(10, 4)

        xshards = XShards.partition([data1, data2])

        data_parts = xshards.rdd.collect()

        data1_parts = [part[0] for part in data_parts]
        data2_parts = [part[1] for part in data_parts]

        reconstructed1 = np.concatenate(data1_parts)
        reconstructed2 = np.concatenate(data2_parts)
        assert np.allclose(data1, reconstructed1)
        assert np.allclose(data2, reconstructed2)
Example #12
    def test_sparkxshards(self):

        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
Example #13
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

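# The Transformer layer below takes both token ids and position ids as
# inputs, so build a position index array (0 .. max_len - 1) for each sample.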
train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
for i in range(0, len(x_train)):
    train_pos[i, :] = np.arange(max_len)
for i in range(0, len(x_test)):
    val_pos[i, :] = np.arange(max_len)

train_dataset = XShards.partition({
    "x": (x_train, train_pos),
    "y": np.array(y_train)
})
val_dataset = XShards.partition({
    "x": (x_test, val_pos),
    "y": np.array(y_test)
})

token_shape = (max_len, )
position_shape = (max_len, )
token_input = Input(shape=token_shape)
position_input = Input(shape=position_shape)
O_seq = TransformerLayer.init(vocab=max_features,
                              hidden_size=128,
                              n_head=8,
                              seq_len=max_len)([token_input, position_input])
# Select the first output of the Transformer. The second is the pooled output.
Example #14
def main(cluster_mode, max_epoch, file_path, batch_size, platform,
         non_interactive):
    import matplotlib
    if not non_interactive and platform == "mac":
        matplotlib.use('qt5agg')

    if cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=4, memory="3g")
    elif cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client",
                          num_nodes=2,
                          cores=2,
                          driver_memory="3g")
    elif cluster_mode == "spark-submit":
        init_orca_context(cluster_mode="spark-submit")
    load_data(file_path)
    img_dir = os.path.join(file_path, "train")
    label_dir = os.path.join(file_path, "train_masks")

    # Here we only take the first 1000 files for simplicity
    df_train = pd.read_csv(os.path.join(file_path, 'train_masks.csv'))
    ids_train = df_train['img'].map(lambda s: s.split('.')[0])
    ids_train = ids_train[:1000]

    x_train_filenames = []
    y_train_filenames = []
    for img_id in ids_train:
        x_train_filenames.append(os.path.join(img_dir,
                                              "{}.jpg".format(img_id)))
        y_train_filenames.append(
            os.path.join(label_dir, "{}_mask.gif".format(img_id)))

    x_train_filenames, x_val_filenames, y_train_filenames, y_val_filenames = \
        train_test_split(x_train_filenames, y_train_filenames, test_size=0.2, random_state=42)

    def load_and_process_image(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = result.astype(float)
        result /= 255.0
        return result

    def load_and_process_image_label(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = np.expand_dims(result[:, :, 1], axis=-1)
        result = result.astype(float)
        result /= 255.0
        return result

    train_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_train_filenames])
    train_label_images = np.stack([
        load_and_process_image_label(filepath)
        for filepath in y_train_filenames
    ])
    val_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_val_filenames])
    val_label_images = np.stack([
        load_and_process_image_label(filepath) for filepath in y_val_filenames
    ])
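    # Wrap the preprocessed NumPy arrays into SparkXShards so that the Orca
    # Estimator can train and evaluate on them in a distributed fashion.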
    train_shards = XShards.partition({
        "x": train_images,
        "y": train_label_images
    })
    val_shards = XShards.partition({"x": val_images, "y": val_label_images})

    # Build the U-Net model
    def conv_block(input_tensor, num_filters):
        encoder = layers.Conv2D(num_filters, (3, 3),
                                padding='same')(input_tensor)
        encoder = layers.Activation('relu')(encoder)
        encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(encoder)
        encoder = layers.Activation('relu')(encoder)
        return encoder

    def encoder_block(input_tensor, num_filters):
        encoder = conv_block(input_tensor, num_filters)
        encoder_pool = layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)

        return encoder_pool, encoder

    def decoder_block(input_tensor, concat_tensor, num_filters):
        decoder = layers.Conv2DTranspose(num_filters, (2, 2),
                                         strides=(2, 2),
                                         padding='same')(input_tensor)
        decoder = layers.concatenate([concat_tensor, decoder], axis=-1)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        return decoder

    inputs = layers.Input(shape=(128, 128, 3))  # 128
    encoder0_pool, encoder0 = encoder_block(inputs, 16)  # 64
    encoder1_pool, encoder1 = encoder_block(encoder0_pool, 32)  # 32
    encoder2_pool, encoder2 = encoder_block(encoder1_pool, 64)  # 16
    encoder3_pool, encoder3 = encoder_block(encoder2_pool, 128)  # 8
    center = conv_block(encoder3_pool, 256)  # center
    decoder3 = decoder_block(center, encoder3, 128)  # 16
    decoder2 = decoder_block(decoder3, encoder2, 64)  # 32
    decoder1 = decoder_block(decoder2, encoder1, 32)  # 64
    decoder0 = decoder_block(decoder1, encoder0, 16)  # 128
    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(decoder0)

    net = models.Model(inputs=[inputs], outputs=[outputs])

    # Define custom metrics
    def dice_coeff(y_true, y_pred):
        smooth = 1.
        # Flatten
        y_true_f = tf.reshape(y_true, [-1])
        y_pred_f = tf.reshape(y_pred, [-1])
        intersection = tf.reduce_sum(y_true_f * y_pred_f)
        score = (2. * intersection + smooth) / \
                (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)
        return score

    # Define custom loss function
    def dice_loss(y_true, y_pred):
        loss = 1 - dice_coeff(y_true, y_pred)
        return loss

    def bce_dice_loss(y_true, y_pred):
        loss = losses.binary_crossentropy(y_true, y_pred) + dice_loss(
            y_true, y_pred)
        return loss

    # compile model
    net.compile(optimizer=tf.keras.optimizers.Adam(2e-3), loss=bce_dice_loss)
    net.summary()

    # create an estimator from keras model
    est = Estimator.from_keras(keras_model=net)
    # fit with estimator
    est.fit(data=train_shards, batch_size=batch_size, epochs=max_epoch)
    # evaluate with estimator
    result = est.evaluate(val_shards)
    print(result)
    # predict with estimator
    val_shards.cache()
    val_image_shards = val_shards.transform_shard(
        lambda val_dict: {"x": val_dict["x"]})
    pred_shards = est.predict(data=val_image_shards, batch_size=batch_size)
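    # Each predicted shard is a dict containing a "prediction" key; take the
    # first shard for visualization.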
    pred = pred_shards.collect()[0]["prediction"]
    val_image_label = val_shards.collect()[0]
    val_image = val_image_label["x"]
    val_label = val_image_label["y"]
    if not non_interactive:
        # visualize 5 predicted results
        plt.figure(figsize=(10, 20))
        for i in range(5):
            img = val_image[i]
            label = val_label[i]
            predicted_label = pred[i]

            plt.subplot(5, 3, 3 * i + 1)
            plt.imshow(img)
            plt.title("Input image")

            plt.subplot(5, 3, 3 * i + 2)
            plt.imshow(label[:, :, 0], cmap='gray')
            plt.title("Actual Mask")
            plt.subplot(5, 3, 3 * i + 3)
            plt.imshow(predicted_label, cmap='gray')
            plt.title("Predicted Mask")
        plt.suptitle("Examples of Input Image, Label, and Prediction")

        plt.show()

    stop_orca_context()