def test_s2s_forecaster_xshard_input(self):
    train_data, val_data, test_data = create_data()
    print("original", train_data[0].dtype)
    init_orca_context(cores=4, memory="2g")
    from bigdl.orca.data import XShards

    def transform_to_dict(data):
        return {'x': data[0], 'y': data[1]}

    def transform_to_dict_x(data):
        return {'x': data[0]}

    train_data = XShards.partition(train_data).transform_shard(transform_to_dict)
    val_data = XShards.partition(val_data).transform_shard(transform_to_dict)
    test_data = XShards.partition(test_data).transform_shard(transform_to_dict_x)
    for distributed in [True, False]:
        forecaster = Seq2SeqForecaster(past_seq_len=24,
                                       future_seq_len=5,
                                       input_feature_num=1,
                                       output_feature_num=1,
                                       loss="mae",
                                       lr=0.01,
                                       distributed=distributed)
        forecaster.fit(train_data, epochs=2)
        distributed_pred = forecaster.predict(test_data)
        distributed_eval = forecaster.evaluate(val_data)
    stop_orca_context()
def test_partition_ndarray_with_num_shards_specification(self):
    data = np.random.randn(10, 4)
    # Reasonable number of shards
    xshards = XShards.partition(data, num_shards=2)
    data_parts = xshards.rdd.collect()
    reconstructed = np.concatenate(data_parts)
    assert np.allclose(data, reconstructed)
    # Requesting more shards than rows would leave some shards empty,
    # which should raise a ValueError
    with pytest.raises(ValueError) as errorInfo:
        xshards = XShards.partition(data, num_shards=20)
    assert errorInfo.type == ValueError
    assert "number of shards" in str(errorInfo.value)
def test_predict_xshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    expected = train_data_shard.collect()
    expected = [shard["x"] for shard in expected]
    for x in expected:
        print(x.shape)
    expected = np.concatenate(expected)
    config = {}
    trainer = Estimator.from_keras(model_creator=identity_model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    result_shards = trainer.predict(train_data_shard, batch_size=10).collect()
    result = [shard["prediction"] for shard in result_shards]
    result = np.concatenate(result)
    # The identity model should return its input unchanged.
    assert np.allclose(expected, result)
def load(self, model_path, minPartitions=None):
    """
    Restore the model from a model file and config.

    :param model_path: the model file.
    :param minPartitions: the minimum number of partitions to use when loading.
    :return: the restored model.
    """
    self.internal = XShards.load_pickle(model_path, minPartitions=minPartitions)
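# A minimal usage sketch for the `load` above. Assumptions (not part of the
# original code): `DistributedModel` is a hypothetical holder class exposing
# this method, and "/tmp/model_dir" was produced by a matching save that
# persisted the shards via XShards' pickle support.
#
#   model = DistributedModel()
#   model.load("/tmp/model_dir", minPartitions=4)
#   shards = model.internal   # a SparkXShards restored from the pickle files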
def test_sparkxshards_with_imbalanced_data(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })

    def random_pad(data):
        import numpy as np
        import random
        # Duplicate each shard a random number of times so that shard
        # sizes become imbalanced across partitions.
        times = random.randint(1, 10)
        data["x"] = np.concatenate([data["x"]] * times)
        data["y"] = np.concatenate([data["y"]] * times)
        return data

    train_data_shard = train_data_shard.transform_shard(random_pad)
    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
def np_to_xshard(x, prefix="x"):
    x = XShards.partition(x)

    def transform_to_dict(train_data):
        return {prefix: train_data}

    return x.transform_shard(transform_to_dict)
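# Hedged usage sketch for np_to_xshard (requires an initialized orca context;
# the arrays below are illustrative assumptions, not part of the original code):
#
#   x_shards = np_to_xshard(np.random.randn(100, 1))              # shards of {"x": slice}
#   y_shards = np_to_xshard(np.random.randn(100, 1), prefix="y")  # shards of {"y": slice}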
def test_partition_ndarray(self):
    data = np.random.randn(10, 4)
    xshards = XShards.partition(data)
    data_parts = xshards.rdd.collect()
    reconstructed = np.concatenate(data_parts)
    assert np.allclose(data, reconstructed)
def get_ray_xshards():
    from bigdl.orca.data import XShards
    import numpy as np

    ndarray_dict = {"x": np.random.randn(10, 4), "y": np.random.randn(10, 4)}
    spark_xshards = XShards.partition(ndarray_dict)
    ray_xshards = RayXShards.from_spark_xshards(spark_xshards)
    return ray_xshards, ndarray_dict
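def check_ray_xshards_roundtrip():
    # Hedged sketch: sanity-check get_ray_xshards by converting the Ray shards
    # back to Spark and comparing against the source dict. The
    # `to_spark_xshards` round-trip call is an assumption about RayXShards'
    # API; adjust if the actual method name differs.
    import numpy as np
    ray_xshards, ndarray_dict = get_ray_xshards()
    parts = ray_xshards.to_spark_xshards().collect()
    reconstructed = np.concatenate([part["x"] for part in parts])
    assert np.allclose(ndarray_dict["x"], reconstructed)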
def test_partition_nested_with_num_shards_specification(self):
    data1 = np.random.randn(10, 4)
    data2 = np.random.randn(10, 4)
    # Reasonable number of shards
    xshards = XShards.partition({"x": (data1,), "y": [data2]}, num_shards=2)
    data_parts = xshards.rdd.collect()
    data1_parts = [part["x"][0] for part in data_parts]
    data2_parts = [part["y"][0] for part in data_parts]
    reconstructed1 = np.concatenate(data1_parts)
    reconstructed2 = np.concatenate(data2_parts)
    assert np.allclose(data1, reconstructed1)
    assert np.allclose(data2, reconstructed2)
    # Requesting more shards than rows would leave some shards empty,
    # which should raise a ValueError
    with pytest.raises(ValueError) as errorInfo:
        xshards = XShards.partition({"x": (data1,), "y": [data2]}, num_shards=20)
    assert errorInfo.type == ValueError
    assert "number of shards" in str(errorInfo.value)
def test_partition_nested(self):
    data1 = np.random.randn(10, 4)
    data2 = np.random.randn(10, 4)
    xshards = XShards.partition({"x": (data1,), "y": [data2]})
    data_parts = xshards.rdd.collect()
    data1_parts = [part["x"][0] for part in data_parts]
    data2_parts = [part["y"][0] for part in data_parts]
    reconstructed1 = np.concatenate(data1_parts)
    reconstructed2 = np.concatenate(data2_parts)
    assert np.allclose(data1, reconstructed1)
    assert np.allclose(data2, reconstructed2)
def test_partition_list(self):
    data1 = np.random.randn(10, 4)
    data2 = np.random.randn(10, 4)
    xshards = XShards.partition([data1, data2])
    data_parts = xshards.rdd.collect()
    data1_parts = [part[0] for part in data_parts]
    data2_parts = [part[1] for part in data_parts]
    reconstructed1 = np.concatenate(data1_parts)
    reconstructed2 = np.concatenate(data2_parts)
    assert np.allclose(data1, reconstructed1)
    assert np.allclose(data2, reconstructed2)
def test_sparkxshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Every sample gets the same position indices 0..max_len-1. The single loop
# assumes len(x_train) == len(x_test), as holds for the IMDB dataset.
train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
for i in range(0, len(x_train)):
    train_pos[i, :] = np.arange(max_len)
    val_pos[i, :] = np.arange(max_len)

train_dataset = XShards.partition({
    "x": (x_train, train_pos),
    "y": np.array(y_train)
})
val_dataset = XShards.partition({
    "x": (x_test, val_pos),
    "y": np.array(y_test)
})

token_shape = (max_len,)
position_shape = (max_len,)
token_input = Input(shape=token_shape)
position_input = Input(shape=position_shape)
O_seq = TransformerLayer.init(vocab=max_features,
                              hidden_size=128,
                              n_head=8,
                              seq_len=max_len)([token_input, position_input])
# Select the first output of the Transformer. The second is the pooled output.
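# One plausible continuation of this snippet (an assumption mirroring the
# public transformer example, not confirmed by the code above): select the
# first Transformer output, pool over time, and attach a binary
# classification head.
#
#   O_seq = SelectTable(0)(O_seq)
#   O_seq = GlobalAveragePooling1D()(O_seq)
#   outputs = Dense(1, activation='sigmoid')(O_seq)
#   model = Model([token_input, position_input], outputs)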
def main(cluster_mode, max_epoch, file_path, batch_size, platform,
         non_interactive):
    import matplotlib
    if not non_interactive and platform == "mac":
        matplotlib.use('qt5agg')

    if cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=4, memory="3g")
    elif cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client",
                          num_nodes=2,
                          cores=2,
                          driver_memory="3g")
    elif cluster_mode == "spark-submit":
        init_orca_context(cluster_mode="spark-submit")

    load_data(file_path)
    img_dir = os.path.join(file_path, "train")
    label_dir = os.path.join(file_path, "train_masks")

    # Here we only take the first 1000 files for simplicity
    df_train = pd.read_csv(os.path.join(file_path, 'train_masks.csv'))
    ids_train = df_train['img'].map(lambda s: s.split('.')[0])
    ids_train = ids_train[:1000]

    x_train_filenames = []
    y_train_filenames = []
    for img_id in ids_train:
        x_train_filenames.append(os.path.join(img_dir, "{}.jpg".format(img_id)))
        y_train_filenames.append(
            os.path.join(label_dir, "{}_mask.gif".format(img_id)))

    x_train_filenames, x_val_filenames, y_train_filenames, y_val_filenames = \
        train_test_split(x_train_filenames, y_train_filenames,
                         test_size=0.2, random_state=42)

    def load_and_process_image(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = result.astype(float)
        result /= 255.0
        return result

    def load_and_process_image_label(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = np.expand_dims(result[:, :, 1], axis=-1)
        result = result.astype(float)
        result /= 255.0
        return result

    train_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_train_filenames])
    train_label_images = np.stack([
        load_and_process_image_label(filepath)
        for filepath in y_train_filenames
    ])
    val_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_val_filenames])
    val_label_images = np.stack([
        load_and_process_image_label(filepath)
        for filepath in y_val_filenames
    ])
    train_shards = XShards.partition({
        "x": train_images,
        "y": train_label_images
    })
    val_shards = XShards.partition({"x": val_images, "y": val_label_images})

    # Build the U-Net model
    def conv_block(input_tensor, num_filters):
        encoder = layers.Conv2D(num_filters, (3, 3),
                                padding='same')(input_tensor)
        encoder = layers.Activation('relu')(encoder)
        encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(encoder)
        encoder = layers.Activation('relu')(encoder)
        return encoder

    def encoder_block(input_tensor, num_filters):
        encoder = conv_block(input_tensor, num_filters)
        encoder_pool = layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)
        return encoder_pool, encoder

    def decoder_block(input_tensor, concat_tensor, num_filters):
        decoder = layers.Conv2DTranspose(num_filters, (2, 2),
                                         strides=(2, 2),
                                         padding='same')(input_tensor)
        decoder = layers.concatenate([concat_tensor, decoder], axis=-1)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        return decoder

    inputs = layers.Input(shape=(128, 128, 3))  # 128
    encoder0_pool, encoder0 = encoder_block(inputs, 16)  # 64
    encoder1_pool, encoder1 = encoder_block(encoder0_pool, 32)  # 32
    encoder2_pool, encoder2 = encoder_block(encoder1_pool, 64)  # 16
    encoder3_pool, encoder3 = encoder_block(encoder2_pool, 128)  # 8
    center = conv_block(encoder3_pool, 256)  # center
    decoder3 = decoder_block(center, encoder3, 128)  # 16
    decoder2 = decoder_block(decoder3, encoder2, 64)  # 32
    decoder1 = decoder_block(decoder2, encoder1, 32)  # 64
    decoder0 = decoder_block(decoder1, encoder0, 16)  # 128
    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(decoder0)
    net = models.Model(inputs=[inputs], outputs=[outputs])

    # Define custom metrics
    def dice_coeff(y_true, y_pred):
        smooth = 1.
        # Flatten
        y_true_f = tf.reshape(y_true, [-1])
        y_pred_f = tf.reshape(y_pred, [-1])
        intersection = tf.reduce_sum(y_true_f * y_pred_f)
        score = (2. * intersection + smooth) / \
            (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)
        return score

    # Define custom loss function
    def dice_loss(y_true, y_pred):
        loss = 1 - dice_coeff(y_true, y_pred)
        return loss

    def bce_dice_loss(y_true, y_pred):
        loss = losses.binary_crossentropy(y_true, y_pred) + dice_loss(
            y_true, y_pred)
        return loss

    # compile model
    net.compile(optimizer=tf.keras.optimizers.Adam(2e-3), loss=bce_dice_loss)
    print(net.summary())

    # create an estimator from keras model
    est = Estimator.from_keras(keras_model=net)

    # fit with estimator
    est.fit(data=train_shards, batch_size=batch_size, epochs=max_epoch)

    # evaluate with estimator
    result = est.evaluate(val_shards)
    print(result)

    # predict with estimator
    val_shards.cache()
    val_image_shards = val_shards.transform_shard(
        lambda val_dict: {"x": val_dict["x"]})
    pred_shards = est.predict(data=val_image_shards, batch_size=batch_size)
    pred = pred_shards.collect()[0]["prediction"]
    val_image_label = val_shards.collect()[0]
    val_image = val_image_label["x"]
    val_label = val_image_label["y"]

    if not non_interactive:
        # visualize 5 predicted results
        plt.figure(figsize=(10, 20))
        for i in range(5):
            img = val_image[i]
            label = val_label[i]
            predicted_label = pred[i]
            plt.subplot(5, 3, 3 * i + 1)
            plt.imshow(img)
            plt.title("Input image")
            plt.subplot(5, 3, 3 * i + 2)
            plt.imshow(label[:, :, 0], cmap='gray')
            plt.title("Actual Mask")
            plt.subplot(5, 3, 3 * i + 3)
            # Drop the trailing channel axis: imshow rejects (H, W, 1) arrays.
            plt.imshow(predicted_label[:, :, 0], cmap='gray')
            plt.title("Predicted Mask")
        plt.suptitle("Examples of Input Image, Label, and Prediction")
        plt.show()

    stop_orca_context()