def test_partition_num_less_than_workers(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=1)
        assert rdd.getNumPartitions() == 1
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() < trainer.num_workers

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
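    # `model_creator` is a shared fixture defined elsewhere in this test module.
    # A minimal sketch of a compatible creator (a tiny single-input Keras model
    # that reads its learning rate from config["lr"]); the exact architecture
    # and loss are assumptions:
    #
    #   def model_creator(config):
    #       import tensorflow as tf
    #       model = tf.keras.Sequential(
    #           [tf.keras.layers.Dense(1, input_shape=(1,))])
    #       model.compile(
    #           optimizer=tf.keras.optimizers.SGD(config.get("lr", 0.01)),
    #           loss="mse",
    #           metrics=["mse"])
    #       return model
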
    def test_parquet_images_training(self):
        from bigdl.orca.learn.tf2 import Estimator
        temp_dir = tempfile.mkdtemp()
        try:
            ParquetDataset.write("file://" + temp_dir, images_generator(),
                                 images_schema)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            output_shapes = {"id": (), "image": (), "label": ()}

            def data_creator(config, batch_size):
                dataset = read_parquet("tf_dataset",
                                       path=path,
                                       output_types=output_types,
                                       output_shapes=output_shapes)
                dataset = dataset.shuffle(10)
                dataset = dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
                dataset = dataset.map(parse_data_train)
                dataset = dataset.batch(batch_size)
                return dataset

            ray_ctx = RayContext.get()
            trainer = Estimator.from_keras(model_creator=model_creator)
            trainer.fit(data=data_creator, epochs=1, batch_size=2)
        finally:
            shutil.rmtree(temp_dir)
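    # `images_generator`, `images_schema` and `parse_data_train` are shared
    # fixtures from the test utilities and are not shown here. A rough sketch of
    # a compatible `parse_data_train` (decode the raw image bytes read from the
    # parquet file and scale them); whether the bytes are JPEG-encoded is an
    # assumption:
    #
    #   def parse_data_train(image, label):
    #       import tensorflow as tf
    #       image = tf.io.decode_jpeg(image, channels=3)
    #       image = tf.cast(image, tf.float32) / 255.0
    #       return image, label
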
    def test_predict_xshards(self):
        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100, ))
        })
        expected = train_data_shard.collect()

        expected = [shard["x"] for shard in expected]

        for x in expected:
            print(x.shape)

        expected = np.concatenate(expected)

        config = {}
        trainer = Estimator.from_keras(model_creator=identity_model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        result_shards = trainer.predict(train_data_shard,
                                        batch_size=10).collect()

        result = [shard["prediction"] for shard in result_shards]
        expected_result = [shard["x"] for shard in result_shards]

        result = np.concatenate(result)

        assert np.allclose(expected, result)
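    # `identity_model_creator` is expected to return a model whose output equals
    # its input, which is why the collected "prediction" column is compared
    # directly against "x" above. A minimal sketch (the input dimension must
    # match the features, here 1; the real definition lives in the test
    # utilities):
    #
    #   def identity_model_creator(config):
    #       import tensorflow as tf
    #       inputs = tf.keras.layers.Input(shape=(1,))
    #       outputs = tf.keras.layers.Lambda(lambda x: tf.identity(x))(inputs)
    #       model = tf.keras.Model(inputs=inputs, outputs=outputs)
    #       model.compile(optimizer="sgd", loss="mse")
    #       return model
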
    def test_pandas_dataframe(self):
        def model_creator(config):
            import tensorflow as tf
            input1 = tf.keras.layers.Input(shape=(1, ))
            input2 = tf.keras.layers.Input(shape=(1, ))
            concatenation = tf.concat([input1, input2], axis=-1)
            outputs = tf.keras.layers.Dense(
                units=1, activation='softmax')(concatenation)
            model = tf.keras.Model(inputs=[input1, input2], outputs=outputs)
            model.compile(**compile_args(config))
            return model

        file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
        train_data_shard = bigdl.orca.data.pandas.read_csv(file_path)

        config = {"lr": 0.8}

        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=1)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["user", "item"],
                    label_cols=["label"])
        trainer.evaluate(train_data_shard,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["user", "item"],
                         label_cols=["label"])
        trainer.predict(train_data_shard, feature_cols=["user",
                                                        "item"]).collect()

    def test_sparkxshards_with_inbalanced_data(self):

        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100))
        })

        def random_pad(data):
            import numpy as np
            import random
            times = random.randint(1, 10)
            data["x"] = np.concatenate([data["x"]] * times)
            data["y"] = np.concatenate([data["y"]] * times)
            return data

        train_data_shard = train_data_shard.transform_shard(random_pad)

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)

    def test_num_part_data_diff_val_data(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=10)
        val_rdd = sc.range(60, numSlices=8)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])
        val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(float)),
                                        int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() > trainer.num_workers
        assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=val_df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])

    def test_dataframe_shard_size(self):
        from bigdl.orca import OrcaContext
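        # _shard_size presumably caps the number of rows per shard when the
        # DataFrame is converted to XShards internally; it is reset to None at
        # the end of this test.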
        OrcaContext._shard_size = 3
        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
        OrcaContext._shard_size = None

    def test_dataframe_with_empty_partition(self):
        from bigdl.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        rdd = sc.range(0, 10)

        rdd_with_empty = rdd.repartition(4).\
            mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd_with_empty.map(lambda x: (DenseVector(np.random.randn(1,).astype(float)),
                                           int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()

    def test_save_and_load(self):
        def model_creator(config):
            import tensorflow as tf
            model = tf.keras.Sequential([
                tf.keras.layers.Conv2D(64,
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       activation='relu',
                                       padding='valid'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
                                             strides=(2, 2),
                                             padding='valid'),
                tf.keras.layers.Conv2D(64,
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       activation='relu',
                                       padding='valid'),
                tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
                                             strides=(2, 2),
                                             padding='valid'),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(10, activation='softmax')
            ])
            model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                          loss='sparse_categorical_crossentropy',
                          metrics=['accuracy'])
            return model

        def train_data_creator(config, batch_size):
            dataset = tf.data.Dataset.from_tensor_slices(
                (np.random.randn(100, 28, 28,
                                 3), np.random.randint(0, 10, (100, 1))))
            dataset = dataset.repeat()
            dataset = dataset.shuffle(1000)
            dataset = dataset.batch(batch_size)
            return dataset

        batch_size = 320
        try:
            est = Estimator.from_keras(model_creator=model_creator,
                                       workers_per_node=2)

            history = est.fit(train_data_creator,
                              epochs=1,
                              batch_size=batch_size,
                              steps_per_epoch=5)
            print("start saving")
            est.save("/tmp/cifar10_keras.ckpt")
            est.load("/tmp/cifar10_keras.ckpt")
            print("save success")
        finally:
            os.remove("/tmp/cifar10_keras.ckpt")

Example 10

    def test_dataframe_predict(self):
        sc = init_nncontext()
        rdd = sc.parallelize(range(20))
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        estimator = Estimator.from_keras(model_creator=identity_model_creator,
                                         verbose=True,
                                         config={},
                                         workers_per_node=2)
        result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
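        # With an identity model every prediction should equal its input
        # feature; the expression below counts the rows where they differ,
        # so the expected error count is 0.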
        expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert result.selectExpr(expr).first()["error"] == 0

Example 11

    def test_horovod_learning_rate_schedule(self):
        import horovod
        major, minor, patch = horovod.__version__.split(".")

        larger_major = int(major) > 0
        larger_minor = int(major) == 0 and int(minor) > 19
        larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2

        if larger_major or larger_minor or larger_patch:
            ray_ctx = RayContext.get()
            batch_size = 32
            workers_per_node = 4
            global_batch_size = batch_size * workers_per_node
            config = {"lr": 0.8}
            trainer = Estimator.from_keras(model_creator=simple_model,
                                           compile_args_creator=compile_args,
                                           verbose=True,
                                           config=config,
                                           backend="horovod",
                                           workers_per_node=workers_per_node)
            import horovod.tensorflow.keras as hvd
            callbacks = [
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                         initial_lr=0.4,
                                                         verbose=True),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                           end_epoch=10,
                                                           multiplier=1.,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                           end_epoch=15,
                                                           multiplier=1e-1,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                           end_epoch=20,
                                                           multiplier=1e-2,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                           multiplier=1e-3,
                                                           initial_lr=0.4),
                LRChecker()
            ]
            for i in range(30):
                trainer.fit(create_train_datasets,
                            epochs=1,
                            batch_size=global_batch_size,
                            callbacks=callbacks)
        else:
            # skip the test on older horovod versions
            pass
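    # `create_train_datasets` and `LRChecker` come from the shared test
    # utilities. `LRChecker` is a callback that verifies the effective learning
    # rate follows the warmup / step schedule configured above; a rough sketch
    # of the idea (the real implementation asserts against expected values that
    # are not shown here):
    #
    #   class LRChecker(tf.keras.callbacks.Callback):
    #       def on_epoch_end(self, epoch, logs=None):
    #           current_lr = float(
    #               tf.keras.backend.get_value(self.model.optimizer.lr))
    #           print(f"epoch {epoch}: lr = {current_lr}")
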
    def test_dataframe(self):
        sc = OrcaContext.get_spark_context()
        rdd = sc.range(0, 100)
        spark = OrcaContext.get_spark_session()

        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(float)),
                      int(np.random.randint(0, 2, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.2}

        try:
            temp_dir = tempfile.mkdtemp()

            trainer = Estimator.from_keras(model_creator=model_creator,
                                           verbose=True,
                                           config=config,
                                           workers_per_node=2,
                                           backend="spark",
                                           model_dir=temp_dir)

            res = trainer.fit(df,
                              epochs=5,
                              batch_size=4,
                              steps_per_epoch=25,
                              feature_cols=["feature"],
                              label_cols=["label"],
                              validation_data=df,
                              validation_steps=1)

            print("start saving")
            trainer.save_weights(os.path.join(temp_dir, "cifar10_keras.h5"))
            trainer.load_weights(os.path.join(temp_dir, "cifar10_keras.h5"))
            trainer.save(os.path.join(temp_dir, "a.model"))
            trainer.load(os.path.join(temp_dir, "a.model"))
            res = trainer.evaluate(df,
                                   batch_size=4,
                                   num_steps=25,
                                   feature_cols=["feature"],
                                   label_cols=["label"])
            print("validation result: ", res)

            res = trainer.predict(df, feature_cols=["feature"]).collect()
        finally:
            shutil.rmtree(temp_dir)

Example 13

    def test_sparkxshards(self):

        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100))
        })

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)

Example 14

    def test_auto_shard_tf(self):
        # file 1 contains all 0s, file 2 contains all 1s
        # If shard by files, then each model will
        # see the same records in the same batch.
        # If shard by records, then each batch
        # will have different records.
        # The loss func is constructed such that
        # the former case will return 0, and the latter
        # case will return non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=auto_shard_model_creator,
                                       verbose=True,
                                       backend="tf2",
                                       workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets,
                            epochs=1,
                            batch_size=4,
                            steps_per_epoch=2)
        assert stats["train_loss"] == 0.0

Example 15

    def test_string_input(self):
        def model_creator(config):
            import tensorflow as tf
            vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
                max_tokens=10, output_mode='int', output_sequence_length=4)
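            # Note: the layer is never adapted to a vocabulary here, so every
            # token maps to the default OOV/padding indices; the test only
            # checks that string columns flow through predict().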
            model = tf.keras.models.Sequential()
            model.add(tf.keras.Input(shape=(1, ), dtype=tf.string))
            model.add(vectorize_layer)
            return model

        from bigdl.orca import OrcaContext
        from pyspark.sql.types import StructType, StructField, StringType
        spark = OrcaContext.get_spark_session()
        schema = StructType([StructField("input", StringType(), True)])
        input_data = [["foo qux bar"], ["qux baz"]]
        input_df = spark.createDataFrame(input_data, schema)
        estimator = Estimator.from_keras(model_creator=model_creator)
        output_df = estimator.predict(input_df,
                                      batch_size=1,
                                      feature_cols=["input"])
        output = output_df.collect()
        print(output)

Example 16

def main():
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        dest="data_dir",
                        help="Required. The path where data locates.")
    parser.add_argument(
        "--output_data",
        dest="output_data",
        default=tempfile.mkdtemp(),
        help="Required. The path where voc parquet data locates.")
    parser.add_argument("--data_year",
                        dest="data_year",
                        default="2009",
                        help="Required. The voc data date.")
    parser.add_argument("--split_name_train",
                        dest="split_name_train",
                        default="train",
                        help="Required. Split name.")
    parser.add_argument("--split_name_test",
                        dest="split_name_test",
                        default="val",
                        help="Required. Split name.")
    parser.add_argument("--names",
                        dest="names",
                        help="Required. The path where class names locates.")
    parser.add_argument("--weights",
                        dest="weights",
                        default="./checkpoints/yolov3.weights",
                        help="Required. The path where weights locates.")
    parser.add_argument("--checkpoint",
                        dest="checkpoint",
                        default="./checkpoints/yolov3.tf",
                        help="Required. The path where checkpoint locates.")
    parser.add_argument(
        "--checkpoint_folder",
        dest="checkpoint_folder",
        default="./checkpoints",
        help="Required. The path where saved checkpoint locates.")
    parser.add_argument("--epochs",
                        dest="epochs",
                        type=int,
                        default=2,
                        help="Required. epochs.")
    parser.add_argument("--batch_size",
                        dest="batch_size",
                        type=int,
                        default=16,
                        help="Required. epochs.")
    parser.add_argument(
        "--cluster_mode",
        dest="cluster_mode",
        default="local",
        help="Required. Run on local/yarn/k8s/spark-submit mode.")
    parser.add_argument("--class_num",
                        dest="class_num",
                        type=int,
                        default=20,
                        help="Required. class num.")
    parser.add_argument(
        "--worker_num",
        type=int,
        default=1,
        help="The number of slave nodes to be used in the cluster."
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--cores",
        type=int,
        default=4,
        help="The number of cpu cores you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--memory",
        type=str,
        default="20g",
        help="The memory you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--object_store_memory",
        type=str,
        default="10g",
        help="The memory you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument("--enable_numa_binding",
                        dest="enable_numa_binding",
                        default=False,
                        help="enable_numa_binding")
    parser.add_argument('--k8s_master',
                        type=str,
                        default="",
                        help="The k8s master. "
                        "It should be k8s://https://<k8s-apiserver-host>: "
                        "<k8s-apiserver-port>.")
    parser.add_argument("--container_image",
                        type=str,
                        default="",
                        help="The runtime k8s image. ")
    parser.add_argument('--k8s_driver_host',
                        type=str,
                        default="",
                        help="The k8s driver localhost.")
    parser.add_argument('--k8s_driver_port',
                        type=str,
                        default="",
                        help="The k8s driver port.")
    parser.add_argument('--nfs_mount_path',
                        type=str,
                        default="",
                        help="nfs mount path")

    options = parser.parse_args()

    if options.cluster_mode == "local":
        init_orca_context(cluster_mode="local",
                          cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "k8s":
        init_orca_context(
            cluster_mode="k8s",
            master=options.k8s_master,
            container_image=options.container_image,
            init_ray_on_spark=True,
            enable_numa_binding=options.enable_numa_binding,
            num_nodes=options.worker_num,
            cores=options.cores,
            memory=options.memory,
            object_store_memory=options.object_store_memory,
            conf={
                "spark.driver.host":
                options.driver_host,
                "spark.driver.port":
                options.driver_port,
                "spark.kubernetes.executor.volumes.persistentVolumeClaim."
                "nfsvolumeclaim.options.claimName":
                "nfsvolumeclaim",
                "spark.kubernetes.executor.volumes.persistentVolumeClaim."
                "nfsvolumeclaim.mount.path":
                options.nfs_mount_path,
                "spark.kubernetes.driver.volumes.persistentVolumeClaim."
                "nfsvolumeclaim.options.claimName":
                "nfsvolumeclaim",
                "spark.kubernetes.driver.volumes.persistentVolumeClaim."
                "nfsvolumeclaim.mount.path":
                options.nfs_mount_path
            })
    elif options.cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client",
                          cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True,
                          enable_numa_binding=options.enable_numa_binding,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "spark-submit":
        init_orca_context(cluster_mode="spark-submit")
    # convert yolov3 weights
    yolo = YoloV3(classes=80)
    load_darknet_weights(yolo, options.weights)
    yolo.save_weights(options.checkpoint)

    def model_creator(config):
        model = YoloV3(DEFAULT_IMAGE_SIZE,
                       training=True,
                       classes=options.class_num)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

        model_pretrained = YoloV3(DEFAULT_IMAGE_SIZE,
                                  training=True,
                                  classes=80)
        model_pretrained.load_weights(options.checkpoint)

        model.get_layer('yolo_darknet').set_weights(
            model_pretrained.get_layer('yolo_darknet').get_weights())
        freeze_all(model.get_layer('yolo_darknet'))

        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        loss = [
            YoloLoss(anchors[mask], classes=options.class_num)
            for mask in anchor_masks
        ]
        model.compile(optimizer=optimizer, loss=loss, run_eagerly=False)
        return model

    # prepare data
    class_map = {
        name: idx
        for idx, name in enumerate(open(options.names).read().splitlines())
    }
    dataset_path = os.path.join(options.data_dir, "VOCdevkit")
    voc_train_path = os.path.join(options.output_data, "train_dataset")
    voc_val_path = os.path.join(options.output_data, "val_dataset")

    write_parquet(format="voc",
                  voc_root_path=dataset_path,
                  output_path="file://" + voc_train_path,
                  splits_names=[(options.data_year, options.split_name_train)],
                  classes=class_map)
    write_parquet(format="voc",
                  voc_root_path=dataset_path,
                  output_path="file://" + voc_val_path,
                  splits_names=[(options.data_year, options.split_name_test)],
                  classes=class_map)

    output_types = {
        "image": tf.string,
        "label": tf.float32,
        "image_id": tf.string
    }
    output_shapes = {"image": (), "label": (None, 5), "image_id": ()}

    def train_data_creator(config, batch_size):
        train_dataset = read_parquet(format="tf_dataset",
                                     path=voc_train_path,
                                     output_types=output_types,
                                     output_shapes=output_shapes)
        train_dataset = train_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        train_dataset = train_dataset.map(parse_data_train)
        train_dataset = train_dataset.shuffle(buffer_size=512)
        train_dataset = train_dataset.batch(batch_size)
        train_dataset = train_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        train_dataset = train_dataset.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        return train_dataset

    def val_data_creator(config, batch_size):
        val_dataset = read_parquet(format="tf_dataset",
                                   path=voc_val_path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
        val_dataset = val_dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
        val_dataset = val_dataset.map(parse_data_train)
        val_dataset = val_dataset.batch(batch_size)
        val_dataset = val_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        return val_dataset

    callbacks = [
        ReduceLROnPlateau(verbose=1),
        EarlyStopping(patience=3, verbose=1),
        ModelCheckpoint(options.checkpoint_folder + '/yolov3_train_{epoch}.tf',
                        verbose=1,
                        save_weights_only=True),
        TensorBoard(log_dir='logs')
    ]

    trainer = Estimator.from_keras(model_creator=model_creator)

    trainer.fit(train_data_creator,
                epochs=options.epochs,
                batch_size=options.batch_size,
                steps_per_epoch=3473 // options.batch_size,
                callbacks=callbacks,
                validation_data=val_data_creator,
                validation_steps=3581 // options.batch_size)
    stop_orca_context()

Example 17

    initial_lr = 0.1 * lr_multiplier
    callbacks = get_lr_schedule_callbacks(initial_lr)

    config = {
        "wd": 0.00005,
        "momentum": 0.9,
        "warmup_epoch": 5,
        "num_worker": args.worker_num,
        "data_dir": args.data_dir,
        "bf16": args.use_bf16,
        "lr": initial_lr,
    }

    trainer = Estimator.from_keras(model_creator=model_creator,
                                   compile_args_creator=compile_args_creator,
                                   verbose=True,
                                   config=config,
                                   backend="horovod")

    if args.benchmark:
        trainer.fit(
            data=train_data_creator
            if not args.use_dummy_data else dummy_data_creator,
            epochs=3,
            batch_size=global_batch_size,
            steps_per_epoch=20,
            callbacks=callbacks,
        )
    else:
        epoch = 0
        for i in range(5):

Example 18

    def impl_test_fit_and_evaluate(self, backend):
        import tensorflow as tf
        ray_ctx = RayContext.get()
        batch_size = 32
        global_batch_size = batch_size * ray_ctx.num_ray_nodes

        if backend == "horovod":
            trainer = Estimator.from_keras(model_creator=simple_model,
                                           compile_args_creator=compile_args,
                                           verbose=True,
                                           config=None,
                                           backend=backend)
        else:

            trainer = Estimator.from_keras(model_creator=model_creator,
                                           verbose=True,
                                           config=None,
                                           backend=backend,
                                           workers_per_node=2)

        # model baseline performance
        start_stats = trainer.evaluate(create_test_dataset,
                                       batch_size=global_batch_size,
                                       num_steps=NUM_TEST_SAMPLES //
                                       global_batch_size)
        print(start_stats)

        def scheduler(epoch):
            if epoch < 2:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.1 * (2 - epoch))

        scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler,
                                                             verbose=1)
        # train for 2 epochs
        trainer.fit(create_train_datasets,
                    epochs=2,
                    batch_size=global_batch_size,
                    steps_per_epoch=10,
                    callbacks=[scheduler])
        trainer.fit(create_train_datasets,
                    epochs=2,
                    batch_size=global_batch_size,
                    steps_per_epoch=10,
                    callbacks=[scheduler])

        # model performance after training (should improve)
        end_stats = trainer.evaluate(create_test_dataset,
                                     batch_size=global_batch_size,
                                     num_steps=NUM_TEST_SAMPLES //
                                     global_batch_size)
        print(end_stats)

        # sanity check that training worked
        dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
        dmse = (end_stats["validation_mean_squared_error"] -
                start_stats["validation_mean_squared_error"])
        print(f"dLoss: {dloss}, dMSE: {dmse}")

        assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
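    # `simple_model`, `compile_args`, `create_train_datasets`,
    # `create_test_dataset` and `NUM_TEST_SAMPLES` are shared fixtures in this
    # test module. A minimal sketch of a compatible test-dataset creator (a toy
    # regression problem, so both the loss and the mean squared error should
    # drop after training); the exact data is an assumption:
    #
    #   NUM_TEST_SAMPLES = 400
    #
    #   def create_test_dataset(config, batch_size):
    #       import numpy as np
    #       import tensorflow as tf
    #       x = np.random.rand(NUM_TEST_SAMPLES, 1).astype(np.float32)
    #       y = 2.0 * x + 0.1
    #       return tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)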