def orca_context_fixture():
    from zoo.orca import init_orca_context, stop_orca_context
    init_orca_context(cores=8, init_ray_on_spark=True,
                      object_store_memory="1g")
    yield
    stop_orca_context()
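
# Usage sketch (assumption: the function above is registered as a pytest
# fixture; the decorator is not shown in the snippet):
#
# @pytest.fixture(autouse=True, scope="module")
# def orca_context_fixture():
#     ...
#
# Tests collected in the same scope then run against an initialized Orca/Ray
# context, and stop_orca_context() is called automatically after the yield.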
def test_forecast_tcmf_distributed(self):
    model = TCMFForecaster(y_iters=1,
                           init_FX_epoch=1,
                           max_FX_epoch=1,
                           max_TCN_epoch=1,
                           alt_iters=2)
    horizon = np.random.randint(1, 50)
    # construct data: 300 series with 480 time steps each
    id = np.arange(300)
    data = np.random.rand(300, 480)
    input = {'id': id, 'y': data}
    from zoo.orca import init_orca_context, stop_orca_context
    init_orca_context(cores=4, spark_log_level="INFO",
                      init_ray_on_spark=True,
                      object_store_memory="1g")
    model.fit(input, num_workers=4)
    with tempfile.TemporaryDirectory() as tempdirname:
        model.save(tempdirname)
        loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
    yhat = model.predict(x=None, horizon=horizon, num_workers=4)
    yhat_loaded = loaded_model.predict(x=None, horizon=horizon, num_workers=4)
    yhat_id = yhat_loaded["id"]
    assert (yhat_id == id).all()
    yhat = yhat["prediction"]
    yhat_loaded = yhat_loaded["prediction"]
    assert yhat.shape == (300, horizon)
    np.testing.assert_equal(yhat, yhat_loaded)
    target_value = np.random.rand(300, horizon)
    target_value = {"y": target_value}
    assert model.evaluate(x=None, target_value=target_value, metric=['mse'])
    stop_orca_context()
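
# Minimal single-node usage sketch of the same TCMFForecaster API (assumption:
# the 'id'/'y' input dict and the 'prediction' output key behave as exercised
# in the test above):
def tcmf_local_sketch():
    model = TCMFForecaster(y_iters=1, init_FX_epoch=1, max_FX_epoch=1,
                           max_TCN_epoch=1, alt_iters=2)
    model.fit({"id": np.arange(300), "y": np.random.rand(300, 480)})
    yhat = model.predict(x=None, horizon=24)
    print(yhat["prediction"].shape)  # expected: (300, 24)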
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--dir', default='/tmp/data', metavar='N',
                        help='the folder to store the MNIST data')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training per executor (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing per executor (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='for saving the current model')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The mode for the Spark cluster. local or yarn.')
    args = parser.parse_args()
    torch.manual_seed(args.seed)

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.dir, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False)

    if args.cluster_mode == "local":
        init_orca_context(cores=1, memory="2g")
    elif args.cluster_mode == "yarn":
        init_orca_context(
            cluster_mode="yarn-client", cores=4, num_nodes=2, memory="2g",
            driver_memory="10g", driver_cores=1,
            conf={"spark.rpc.message.maxSize": "1024",
                  "spark.task.maxFailures": "1",
                  "spark.driver.extraJavaOptions": "-Dbigdl.failure.retryTimes=1"})

    model = LeNet()
    model.train()
    criterion = nn.NLLLoss()
    adam = torch.optim.Adam(model.parameters(), args.lr)
    est = Estimator.from_torch(model=model, optimizer=adam, loss=criterion)
    est.fit(data=train_loader, epochs=args.epochs,
            validation_data=test_loader,
            validation_metrics=[Accuracy()],
            checkpoint_trigger=EveryEpoch())
    result = est.evaluate(data=test_loader, validation_metrics=[Accuracy()])
    for r in result:
        print(str(r))
    stop_orca_context()
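
# Example invocations (assumption: the script file is named lenet_mnist.py;
# the actual file name is not given in the snippet):
#   python lenet_mnist.py --cluster_mode local --epochs 2
#   python lenet_mnist.py --cluster_mode yarn --batch-size 256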
def main(max_epoch):
    sc = init_orca_context(cores=4, memory="2g")

    # get DataSet
    # as_supervised returns a tuple (img, label) instead of a dict
    # {'image': img, 'label': label}
    mnist_train = tfds.load(name="mnist", split="train", as_supervised=True)
    mnist_test = tfds.load(name="mnist", split="test", as_supervised=True)

    # Normalizes images, uint8 -> float32
    def normalize_img(image, label):
        return tf.cast(image, tf.float32) / 255., label

    mnist_train = mnist_train.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    mnist_test = mnist_test.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh',
                               input_shape=(28, 28, 1), padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh', padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(500, activation='tanh'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test)

    result = est.evaluate(mnist_test)
    print(result)

    est.save_keras_model("/tmp/mnist_keras.h5")
    stop_orca_context()
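
# Follow-up sketch: reloading the saved model with plain Keras for local
# inference (assumption: save_keras_model above writes a standard HDF5 Keras
# model that tf.keras.models.load_model can read):
def load_saved_mnist_model():
    import tensorflow as tf
    model = tf.keras.models.load_model("/tmp/mnist_keras.h5")
    model.summary()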
def main():
    parser = argparse.ArgumentParser(description='PyTorch Tensorboard Example')
    parser.add_argument('--cluster_mode', type=str, default="local",
                        help='The cluster mode, such as local, yarn or k8s.')
    args = parser.parse_args()
    if args.cluster_mode == "local":
        init_orca_context()
    elif args.cluster_mode == "yarn":
        init_orca_context(cluster_mode=args.cluster_mode, cores=4, num_nodes=2)

    writer = SummaryWriter('runs/fashion_mnist_experiment_1')

    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

    # plot some random training images
    dataiter = iter(train_data_creator(config={}))
    images, labels = next(dataiter)

    # create grid of images
    img_grid = torchvision.utils.make_grid(images)

    # show images
    matplotlib_imshow(img_grid, one_channel=True)

    # write to tensorboard
    writer.add_image('four_fashion_mnist_images', img_grid)

    # inspect the model using tensorboard
    writer.add_graph(model_creator(config={}), images)
    writer.close()

    # training loss vs. epochs
    criterion = nn.CrossEntropyLoss()
    orca_estimator = Estimator.from_torch(model=model_creator,
                                          optimizer=optimizer_creator,
                                          loss=criterion,
                                          backend="torch_distributed")
    stats = orca_estimator.fit(train_data_creator, epochs=5, batch_size=4)
    for stat in stats:
        writer.add_scalar("training_loss", stat['train_loss'], stat['epoch'])
    print("Train stats: {}".format(stats))
    val_stats = orca_estimator.evaluate(validation_data_creator)
    print("Validation stats: {}".format(val_stats))
    orca_estimator.shutdown()
    stop_orca_context()
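
# To inspect the logged images, graph, and scalars, point TensorBoard at the
# log directory used by the SummaryWriter above (standard TensorBoard CLI):
#   tensorboard --logdir=runs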
def main(max_epoch):
    sc = init_orca_context(cores=4, memory="2g")

    # get DataSet
    mnist_train = tfds.load(name="mnist", split="train")
    mnist_test = tfds.load(name="mnist", split="test")

    # Normalizes images
    def normalize_img(data):
        data['image'] = tf.cast(data["image"], tf.float32) / 255.
        return data

    mnist_train = mnist_train.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    mnist_test = mnist_test.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # tensorflow inputs
    images = tf.placeholder(dtype=tf.float32, shape=(None, 28, 28, 1))
    # tensorflow labels
    labels = tf.placeholder(dtype=tf.int32, shape=(None,))

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
    acc = accuracy(logits, labels)

    # create an estimator
    est = Estimator.from_graph(inputs=images,
                               outputs=logits,
                               labels=labels,
                               loss=loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"acc": acc})
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test)

    result = est.evaluate(mnist_test)
    print(result)

    est.save_tf_checkpoint("/tmp/lenet/model")
    stop_orca_context()
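
# Follow-up sketch: the checkpoint written by save_tf_checkpoint above can be
# restored with the plain TF1 Saver API (assumption: the same LeNet graph is
# rebuilt in the session first; `sess` and `saver` are placeholders here):
def restore_lenet_checkpoint(sess, saver):
    # saver = tf.train.Saver() built against the reconstructed graph
    saver.restore(sess, "/tmp/lenet/model")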
def orca_context_fixture(request):
    import os
    from zoo.orca import OrcaContext, init_orca_context, stop_orca_context
    OrcaContext._eager_mode = True
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id is not None and secret_access_key is not None:
        env = {"AWS_ACCESS_KEY_ID": access_key_id,
               "AWS_SECRET_ACCESS_KEY": secret_access_key}
    else:
        env = None
    sc = init_orca_context(cores=4, spark_log_level="INFO",
                           env=env, object_store_memory="1g")
    yield sc
    stop_orca_context()
def orca_context_fixture():
    sc = init_orca_context(cores=8)

    def to_array_(v):
        return v.toArray().tolist()

    def flatten_(v):
        result = []
        for elem in v:
            result.extend(elem.toArray().tolist())
        return result

    spark = SparkSession(sc)
    spark.udf.register("to_array", to_array_, ArrayType(DoubleType()))
    spark.udf.register("flatten", flatten_, ArrayType(DoubleType()))
    yield
    stop_orca_context()
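
# Usage sketch for the UDFs registered above (assumption: `df` has a Spark ML
# vector column "features" and an array-of-vectors column "nested"; both
# column names are illustrative, not from the original):
def udf_usage_sketch(df):
    df.selectExpr("to_array(features) as features_array").show()
    df.selectExpr("flatten(nested) as flat").show()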
def main(cluster_mode, max_epoch, file_path, batch_size, platform,
         non_interactive):
    import matplotlib
    if not non_interactive and platform == "mac":
        matplotlib.use('qt5agg')

    if cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=4, memory="3g")
    elif cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=2,
                          driver_memory="3g")
    load_data(file_path)
    img_dir = os.path.join(file_path, "train")
    label_dir = os.path.join(file_path, "train_masks")

    # Here we only take the first 1000 files for simplicity
    df_train = pd.read_csv(os.path.join(file_path, 'train_masks.csv'))
    ids_train = df_train['img'].map(lambda s: s.split('.')[0])
    ids_train = ids_train[:1000]

    x_train_filenames = []
    y_train_filenames = []
    for img_id in ids_train:
        x_train_filenames.append(os.path.join(img_dir,
                                               "{}.jpg".format(img_id)))
        y_train_filenames.append(
            os.path.join(label_dir, "{}_mask.gif".format(img_id)))

    x_train_filenames, x_val_filenames, y_train_filenames, y_val_filenames = \
        train_test_split(x_train_filenames, y_train_filenames,
                         test_size=0.2, random_state=42)

    def load_and_process_image(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = result.astype(float)
        result /= 255.0
        return result

    def load_and_process_image_label(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = np.expand_dims(result[:, :, 1], axis=-1)
        result = result.astype(float)
        result /= 255.0
        return result

    train_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_train_filenames])
    train_label_images = np.stack(
        [load_and_process_image_label(filepath)
         for filepath in y_train_filenames])
    val_images = np.stack(
        [load_and_process_image(filepath) for filepath in x_val_filenames])
    val_label_images = np.stack(
        [load_and_process_image_label(filepath)
         for filepath in y_val_filenames])

    train_shards = XShards.partition({"x": train_images,
                                      "y": train_label_images})
    val_shards = XShards.partition({"x": val_images, "y": val_label_images})

    # Build the U-Net model
    def conv_block(input_tensor, num_filters):
        encoder = layers.Conv2D(num_filters, (3, 3),
                                padding='same')(input_tensor)
        encoder = layers.Activation('relu')(encoder)
        encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(encoder)
        encoder = layers.Activation('relu')(encoder)
        return encoder

    def encoder_block(input_tensor, num_filters):
        encoder = conv_block(input_tensor, num_filters)
        encoder_pool = layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)
        return encoder_pool, encoder

    def decoder_block(input_tensor, concat_tensor, num_filters):
        decoder = layers.Conv2DTranspose(num_filters, (2, 2), strides=(2, 2),
                                         padding='same')(input_tensor)
        decoder = layers.concatenate([concat_tensor, decoder], axis=-1)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
        decoder = layers.Activation('relu')(decoder)
        return decoder

    inputs = layers.Input(shape=(128, 128, 3))  # 128
    encoder0_pool, encoder0 = encoder_block(inputs, 16)  # 64
    encoder1_pool, encoder1 = encoder_block(encoder0_pool, 32)  # 32
    encoder2_pool, encoder2 = encoder_block(encoder1_pool, 64)  # 16
    encoder3_pool, encoder3 = encoder_block(encoder2_pool, 128)  # 8
    center = conv_block(encoder3_pool, 256)  # center
    decoder3 = decoder_block(center, encoder3, 128)  # 16
    decoder2 = decoder_block(decoder3, encoder2, 64)  # 32
    decoder1 = decoder_block(decoder2, encoder1, 32)  # 64
    decoder0 = decoder_block(decoder1, encoder0, 16)  # 128
    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(decoder0)

    net = models.Model(inputs=[inputs], outputs=[outputs])

    # Define custom metrics
    def dice_coeff(y_true, y_pred):
        smooth = 1.
        # Flatten
        y_true_f = tf.reshape(y_true, [-1])
        y_pred_f = tf.reshape(y_pred, [-1])
        intersection = tf.reduce_sum(y_true_f * y_pred_f)
        score = (2. * intersection + smooth) / \
                (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)
        return score

    # Define custom loss function
    def dice_loss(y_true, y_pred):
        loss = 1 - dice_coeff(y_true, y_pred)
        return loss

    def bce_dice_loss(y_true, y_pred):
        loss = losses.binary_crossentropy(y_true, y_pred) + \
            dice_loss(y_true, y_pred)
        return loss

    # compile model
    net.compile(optimizer=tf.keras.optimizers.Adam(2e-3), loss=bce_dice_loss)
    net.summary()

    # create an estimator from keras model
    est = Estimator.from_keras(keras_model=net)
    # fit with estimator
    est.fit(data=train_shards,
            batch_size=batch_size,
            epochs=max_epoch)
    # evaluate with estimator
    result = est.evaluate(val_shards)
    print(result)
    # predict with estimator
    val_shards.cache()
    val_image_shards = val_shards.transform_shard(
        lambda val_dict: {"x": val_dict["x"]})
    pred_shards = est.predict(data=val_image_shards, batch_size=batch_size)
    pred = pred_shards.collect()[0]["prediction"]
    val_image_label = val_shards.collect()[0]
    val_image = val_image_label["x"]
    val_label = val_image_label["y"]
    if not non_interactive:
        # visualize 5 predicted results
        plt.figure(figsize=(10, 20))
        for i in range(5):
            img = val_image[i]
            label = val_label[i]
            predicted_label = pred[i]
            plt.subplot(5, 3, 3 * i + 1)
            plt.imshow(img)
            plt.title("Input image")
            plt.subplot(5, 3, 3 * i + 2)
            plt.imshow(label[:, :, 0], cmap='gray')
            plt.title("Actual Mask")
            plt.subplot(5, 3, 3 * i + 3)
            plt.imshow(predicted_label, cmap='gray')
            plt.title("Predicted Mask")
        plt.suptitle("Examples of Input Image, Label, and Prediction")
        plt.show()
    stop_orca_context()
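
# Sanity-check sketch for the Dice coefficient defined above, computed with
# NumPy on a toy mask (assumption: identical masks should score ~1.0):
def dice_numpy_sketch():
    y = np.ones((4, 4), dtype=np.float32).reshape(-1)
    smooth = 1.
    intersection = np.sum(y * y)
    score = (2. * intersection + smooth) / (np.sum(y) + np.sum(y) + smooth)
    print(score)  # (32 + 1) / (32 + 1) = 1.0 for identical masks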
def teardown_method(self, method):
    stop_orca_context()
def main():
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", dest="data_dir",
                        help="Required. The path where the data is located.")
    parser.add_argument("--output_data", dest="output_data",
                        default=tempfile.mkdtemp(),
                        help="Required. The path where the VOC parquet data is located.")
    parser.add_argument("--data_year", dest="data_year", default="2009",
                        help="Required. The VOC data year.")
    parser.add_argument("--split_name_train", dest="split_name_train",
                        default="train", help="Required. Split name.")
    parser.add_argument("--split_name_test", dest="split_name_test",
                        default="val", help="Required. Split name.")
    parser.add_argument("--names", dest="names",
                        help="Required. The path where the class names are located.")
    parser.add_argument("--weights", dest="weights",
                        default="./checkpoints/yolov3.weights",
                        help="Required. The path where the weights are located.")
    parser.add_argument("--checkpoint", dest="checkpoint",
                        default="./checkpoints/yolov3.tf",
                        help="Required. The path where the checkpoint is located.")
    parser.add_argument("--checkpoint_folder", dest="checkpoint_folder",
                        default="./checkpoints",
                        help="Required. The path where saved checkpoints are located.")
    parser.add_argument("--epochs", dest="epochs", type=int, default=2,
                        help="Required. The number of epochs.")
    parser.add_argument("--batch_size", dest="batch_size", type=int,
                        default=16, help="Required. The batch size.")
    parser.add_argument("--cluster_mode", dest="cluster_mode", default="local",
                        help="Required. Run on local/yarn/k8s mode.")
    parser.add_argument("--class_num", dest="class_num", type=int, default=20,
                        help="Required. The number of classes.")
    parser.add_argument("--worker_num", type=int, default=1,
                        help="The number of slave nodes to be used in the cluster. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--cores", type=int, default=4,
                        help="The number of cpu cores you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--memory", type=str, default="20g",
                        help="The memory you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--object_store_memory", type=str, default="10g",
                        help="The object store memory you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument('--k8s_master', type=str, default="",
                        help="The k8s master. It should be "
                             "k8s://https://<k8s-apiserver-host>:<k8s-apiserver-port>.")
    parser.add_argument("--container_image", type=str, default="",
                        help="The runtime k8s image.")
    parser.add_argument('--k8s_driver_host', type=str, default="",
                        help="The k8s driver localhost.")
    parser.add_argument('--k8s_driver_port', type=str, default="",
                        help="The k8s driver port.")

    options = parser.parse_args()

    # convert yolov3 weights
    yolo = YoloV3(classes=80)
    load_darknet_weights(yolo, options.weights)
    yolo.save_weights(options.checkpoint)

    def model_creator(config):
        model = YoloV3(DEFAULT_IMAGE_SIZE, training=True,
                       classes=options.class_num)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

        model_pretrained = YoloV3(DEFAULT_IMAGE_SIZE, training=True,
                                  classes=80)
        model_pretrained.load_weights(options.checkpoint)

        model.get_layer('yolo_darknet').set_weights(
            model_pretrained.get_layer('yolo_darknet').get_weights())
        freeze_all(model.get_layer('yolo_darknet'))

        optimizer = tf.keras.optimizers.Adam(lr=1e-3)
        loss = [YoloLoss(anchors[mask], classes=options.class_num)
                for mask in anchor_masks]
        model.compile(optimizer=optimizer, loss=loss, run_eagerly=False)
        return model

    # prepare data
    class_map = {name: idx for idx, name in
                 enumerate(open(options.names).read().splitlines())}
    dataset_path = os.path.join(options.data_dir, "VOCdevkit")
    voc_train_path = os.path.join(options.output_data, "train_dataset")
    voc_val_path = os.path.join(options.output_data, "val_dataset")
    write_parquet(format="voc", voc_root_path=dataset_path,
                  output_path="file://" + voc_train_path,
                  splits_names=[(options.data_year, options.split_name_train)],
                  classes=class_map)
    write_parquet(format="voc", voc_root_path=dataset_path,
                  output_path="file://" + voc_val_path,
                  splits_names=[(options.data_year, options.split_name_test)],
                  classes=class_map)

    output_types = {"image": tf.string, "label": tf.float32,
                    "image_id": tf.string}
    output_shapes = {"image": (), "label": (None, 5), "image_id": ()}

    def train_data_creator(config, batch_size):
        train_dataset = read_parquet(format="tf_dataset", path=voc_train_path,
                                     output_types=output_types,
                                     output_shapes=output_shapes)
        train_dataset = train_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        train_dataset = train_dataset.map(parse_data_train)
        train_dataset = train_dataset.shuffle(buffer_size=512)
        train_dataset = train_dataset.batch(batch_size)
        train_dataset = train_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        train_dataset = train_dataset.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        return train_dataset

    def val_data_creator(config, batch_size):
        val_dataset = read_parquet(format="tf_dataset", path=voc_val_path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
        val_dataset = val_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        val_dataset = val_dataset.map(parse_data_train)
        val_dataset = val_dataset.batch(batch_size)
        val_dataset = val_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        return val_dataset

    callbacks = [
        ReduceLROnPlateau(verbose=1),
        EarlyStopping(patience=3, verbose=1),
        ModelCheckpoint(options.checkpoint_folder + '/yolov3_train_{epoch}.tf',
                        verbose=1, save_weights_only=True),
        TensorBoard(log_dir='logs')
    ]

    if options.cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True, enable_numa_binding=False,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "k8s":
        init_orca_context(cluster_mode="k8s",
                          master=options.k8s_master,
                          container_image=options.container_image,
                          init_ray_on_spark=True, enable_numa_binding=False,
                          num_nodes=options.worker_num, cores=options.cores,
                          memory=options.memory,
                          object_store_memory=options.object_store_memory,
                          conf={"spark.driver.host": options.k8s_driver_host,
                                "spark.driver.port": options.k8s_driver_port})
    elif options.cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client", cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True, enable_numa_binding=False,
                          object_store_memory=options.object_store_memory)

    trainer = Estimator.from_keras(model_creator=model_creator)

    trainer.fit(train_data_creator,
                epochs=options.epochs,
                batch_size=options.batch_size,
                steps_per_epoch=3473 // options.batch_size,
                callbacks=callbacks,
                validation_data=val_data_creator,
                validation_steps=3581 // options.batch_size)
    stop_orca_context()
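
# Inference sketch after training (assumptions: YoloV3 comes from the same
# example module, "yolov3_train_2.tf" is one of the checkpoints written by the
# ModelCheckpoint callback above, and training=False builds the inference
# graph that returns detections):
def inference_sketch():
    model = YoloV3(DEFAULT_IMAGE_SIZE, training=False, classes=20)
    model.load_weights("./checkpoints/yolov3_train_2.tf")
    # img: a float32 tensor of shape (1, DEFAULT_IMAGE_SIZE,
    # DEFAULT_IMAGE_SIZE, 3), preprocessed with transform_images:
    # boxes, scores, classes, nums = model(img)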