Example #1
    def setup_method(self, method):
        """ setup any state tied to the execution of the given method in a
        class.  setup_method is invoked for every test method of a class.
        """
        sparkConf = init_spark_conf().setMaster("local[4]").setAppName(
            "test feature set")
        self.sc = init_nncontext(sparkConf)
Example #2
    def test_dataframe_shard_size(self):
        from zoo.orca import OrcaContext
        OrcaContext._shard_size = 3
        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #3
    def test_openvino(self):
        with tempfile.TemporaryDirectory() as local_path:
            model_url = data_url + "/analytics-zoo-data/openvino2020_resnet50.tar"
            model_path = maybe_download("openvino2020_resnet50.tar",
                                        local_path, model_url)
            cmd = "tar -xvf " + model_path + " -C " + local_path
            subprocess.run(cmd.split(), check=True)  # block until extraction completes
            model_path = os.path.join(
                local_path, "openvino2020_resnet50/resnet_v1_50.xml")
            est = Estimator.from_openvino(model_path=model_path)

            # ndarray
            input_data = np.random.random([20, 4, 3, 224, 224])
            result = est.predict(input_data)
            print(result)

            # xshards
            input_data_list = [
                np.random.random([1, 4, 3, 224, 224]),
                np.random.random([2, 4, 3, 224, 224])
            ]
            sc = init_nncontext()
            rdd = sc.parallelize(input_data_list, numSlices=2)
            shards = SparkXShards(rdd)

            def pre_processing(images):
                return {"x": images}

            shards = shards.transform_shard(pre_processing)
            result = est.predict(shards)
            result_c = result.collect()
            print(result_c)
Example #4
    def create_sc(self, submit_args, conf):
        submit_args = submit_args + " pyspark-shell"
        os.environ["PYSPARK_SUBMIT_ARGS"] = submit_args
        spark_conf = init_spark_conf(conf)
        sc = init_nncontext(conf=spark_conf, spark_log_level=self.spark_log_level,
                            redirect_spark_log=self.redirect_spark_log)
        return sc
Example #5
    def test_estimator_graph_dataframe(self):
        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=tf.train.AdamOptimizer(),
                                   metrics={"loss": model.loss})

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                labels_cols=['label'],
                validation_data=df)

        result = est.evaluate(df,
                              batch_size=4,
                              feature_cols=['user', 'item'],
                              labels_cols=['label'])
        print(result)

        prediction_df = est.predict(df,
                                    batch_size=4,
                                    feature_cols=['user', 'item'])
        assert 'prediction' in prediction_df.columns
        predictions = prediction_df.collect()
        assert len(predictions) == 10
Example #6
    def test_partition_num_less_than_workers(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=1)
        assert rdd.getNumPartitions() == 1
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() < trainer.num_workers

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #7
    def _create_sc(self, submit_args, conf):
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
        zoo_conf = init_spark_conf(conf)
        sc = init_nncontext(conf=zoo_conf,
                            spark_log_level=self.spark_log_level,
                            redirect_spark_log=self.redirect_spark_log)
        return sc
Example #8
    def test_num_part_data_diff_val_data(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=10)
        val_rdd = sc.range(60, numSlices=8)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])
        val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                                        int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() > trainer.num_workers
        assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=val_df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
Example #9
def main(data_num):

    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data), batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)), axis=1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")

        predictor = TFPredictor(sess, [correct])

        accuracy = predictor.predict().mean()

        print("predict accuracy is %s" % accuracy)
Example #10
def main(option):
    sc = init_nncontext()

    def input_fn(mode, params):

        if mode == tf.estimator.ModeKeys.TRAIN:
            image_set = ImageSet.read(params["image_path"],
                                      sc=sc,
                                      with_label=True,
                                      one_based_label=False)
            train_transformer = ChainedPreprocessing([
                ImageBytesToMat(),
                ImageResize(256, 256),
                ImageRandomCrop(224, 224),
                ImageRandomPreprocessing(ImageHFlip(), 0.5),
                ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                      0.225),
                ImageMatToTensor(to_RGB=True, format="NHWC"),
                ImageSetToSample(input_keys=["imageTensor"],
                                 target_keys=["label"])
            ])
            feature_set = FeatureSet.image_frame(image_set.to_image_frame())
            feature_set = feature_set.transform(train_transformer)
            feature_set = feature_set.transform(ImageFeatureToSample())
            dataset = TFDataset.from_feature_set(feature_set,
                                                 features=(tf.float32,
                                                           [224, 224, 3]),
                                                 labels=(tf.int32, [1]),
                                                 batch_size=16)
        else:
            raise NotImplementedError

        return dataset

    def model_fn(features, labels, mode, params):
        from nets import inception
        slim = tf.contrib.slim
        labels = tf.squeeze(labels, axis=1)
        with slim.arg_scope(inception.inception_v1_arg_scope()):
            logits, end_points = inception.inception_v1(
                features,
                num_classes=int(params["num_classes"]),
                is_training=True)

        if mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                       labels=labels))
            return TFEstimatorSpec(mode, predictions=logits, loss=loss)
        else:
            raise NotImplementedError

    estimator = TFEstimator(model_fn,
                            tf.train.AdamOptimizer(),
                            params={
                                "image_path": option.image_path,
                                "num_classes": option.num_classes
                            })

    estimator.train(input_fn, steps=100)
Example #11
    def setup_method(self, method):
        """ setup any state tied to the execution of the given method in a
        class.  setup_method is invoked for every test method of a class.
        """
        sparkConf = init_spark_conf().setMaster("local[1]").setAppName("testEstimator")
        self.sc = init_nncontext(sparkConf)
        self.sqlContext = SQLContext(self.sc)
        assert self.sc.appName == "testEstimator"
Example #12
    def _create_sc(self, submit_args, conf):
        from pyspark.sql import SparkSession
        print("pyspark_submit_args is: {}".format(submit_args))
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
        zoo_conf = init_spark_conf(conf)
        sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)

        return sc
Example #13
def main():
    sc = init_nncontext()

    global_batch_size = 256

    loss = create_model(creat_dataset(global_batch_size))

    optimizer = TFOptimizer.from_loss(loss, SGD(1e-3), model_dir="/tmp/lenet/")
    optimizer.optimize(end_trigger=MaxIteration(20))
Example #14
    def init_spark_on_local(self, cores, conf=None, python_location=None):
        print("Start to getOrCreate SparkContext")
        os.environ['PYSPARK_PYTHON'] = \
            python_location if python_location else self._detect_python_location()
        master = "local[{}]".format(cores)
        zoo_conf = init_spark_conf(conf).setMaster(master)
        sc = init_nncontext(conf=zoo_conf, redirect_spark_log=self.redirect_spark_log)
        sc.setLogLevel(self.spark_log_level)
        print("Successfully got a SparkContext")
        return sc
Example #15
def main():
    sc = init_nncontext()

    def model_fn(features, labels, mode):
        from nets import lenet
        slim = tf.contrib.slim
        with slim.arg_scope(lenet.lenet_arg_scope()):
            logits, end_points = lenet.lenet(features,
                                             num_classes=10,
                                             is_training=True)

        if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                       labels=labels))
            return TFEstimatorSpec(mode, predictions=logits, loss=loss)
        else:
            return TFEstimatorSpec(mode, predictions=logits)

    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_rdd = get_data_rdd("train", sc)
            dataset = TFDataset.from_rdd(training_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         labels=(tf.int32, []),
                                         batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_rdd = get_data_rdd("test", sc)
            dataset = TFDataset.from_rdd(testing_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         labels=(tf.int32, []),
                                         batch_size=320)
        else:
            testing_rdd = get_data_rdd("test", sc).map(lambda x: x[0])
            dataset = TFDataset.from_rdd(testing_rdd,
                                         features=(tf.float32, [28, 28, 1]),
                                         batch_per_thread=80)

        return dataset

    estimator = TFEstimator(model_fn,
                            tf.train.AdamOptimizer(),
                            model_dir="/tmp/estimator")

    estimator.train(input_fn, steps=60000 // 320)

    metrics = estimator.evaluate(input_fn, ["acc"])
    print(metrics)

    predictions = estimator.predict(input_fn)

    print(predictions.first())
Example #16
    def test_openvino_predict_spark_df(self):
        from pyspark.sql import SparkSession

        sc = init_nncontext()
        spark = SparkSession(sc)
        rdd = sc.range(0, 20, numSlices=2)
        input_df = rdd.map(
            lambda x: (np.random.random([1, 4, 3, 224, 224]).tolist())).toDF(
                ["feature"])
        result_df = self.est.predict(input_df, feature_cols=["feature"])
        assert np.array(result_df.select("prediction").first()).shape == (1, 4,
                                                                          1000)
        assert result_df.count() == 20
Example #17
    def test_dataframe_predict(self):

        sc = init_nncontext()
        rdd = sc.parallelize(range(20))
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
        expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert result.selectExpr(expr).first()["error"] == 0
Example #18
def main():
    sc = init_nncontext()

    def model_fn(features, labels, mode):
        from nets import lenet
        slim = tf.contrib.slim
        with slim.arg_scope(lenet.lenet_arg_scope()):
            logits, end_points = lenet.lenet(features,
                                             num_classes=10,
                                             is_training=True)

        if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                       labels=labels))

            optimizer = ZooOptimizer(tf.train.AdamOptimizer())
            train_op = optimizer.minimize(loss)
            return tf.estimator.EstimatorSpec(mode,
                                              predictions=logits,
                                              loss=loss,
                                              train_op=train_op)
        else:
            return tf.estimator.EstimatorSpec(mode, predictions=logits)

    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_data = get_data("train")
            dataset = TFDataset.from_ndarrays(training_data, batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_data = get_data("test")
            dataset = TFDataset.from_ndarrays(testing_data,
                                              batch_per_thread=80)
        else:
            images, _ = get_data("test")
            dataset = TFDataset.from_ndarrays(images, batch_per_thread=80)

        return dataset

    estimator = TFEstimator.from_model_fn(model_fn, model_dir="/tmp/estimator")

    estimator.train(input_fn, steps=10)

    metrics = estimator.evaluate(input_fn, ["acc"])
    print(metrics)

    predictions = estimator.predict(input_fn)

    print(predictions.first())
    print("finished...")
    sc.stop()
Example #19
    def test_estimator_graph_dataframe_exception(self):

        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=tf.train.AdamOptimizer(),
                                   metrics={"loss": model.loss})

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    validation_data=df)
        self.assertTrue(
            'label columns is None; it should not be None in training' in str(
                context.exception))

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                labels_cols=['label'])
        with self.assertRaises(Exception) as context:
            predictions = est.predict(df, batch_size=4).collect()
        self.assertTrue(
            'feature columns is None; it should not be None in prediction' in
            str(context.exception))

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    labels_cols=['label'],
                    validation_data=[1, 2, 3])
        self.assertTrue(
            'train data and validation data should be both Spark DataFrame' in
            str(context.exception))
Example #20
def main(data_num):

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(inputs=data, outputs=predictions)

    model.load_weights("/tmp/mnist_keras.h5")

    if DISTRIBUTED:
        # using RDD api to do distributed evaluation
        sc = init_nncontext()
        # get data, pre-process and create TFDataset
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD)])

        dataset = TFDataset.from_rdd(rdd,
                                     names=["features"],
                                     shapes=[[28, 28, 1]],
                                     types=[tf.float32],
                                     batch_per_thread=20)
        predictor = TFPredictor.from_keras(model, dataset)

        accuracy = predictor.predict().zip(labels_rdd).map(
            lambda x: np.argmax(x[0]) == x[1]).mean()

        print("predict accuracy is %s" % accuracy)

    else:
        # using keras api for local evaluation
        model.compile(optimizer='rmsprop',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
        images_data = normalizer(images_data, mnist.TRAIN_MEAN,
                                 mnist.TRAIN_STD)
        result = model.evaluate(images_data, labels_data)
        print(model.metrics_names)
        print(result)
Example #21
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data,
         labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    # create an optimizer
    optimizer = TFOptimizer(loss,
                            Adam(1e-3),
                            val_outputs=[logits],
                            val_labels=[labels],
                            val_method=Top1Accuracy())
    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #22
    def test_openvino_predict_xshards(self):
        input_data_list = [np.array([self.input] * 4), np.array([self.input] * 2)]
        sc = init_nncontext()
        rdd = sc.parallelize(input_data_list, numSlices=2)
        shards = SparkXShards(rdd)

        def pre_processing(images):
            return {"x": images}

        shards = shards.transform_shard(pre_processing)
        result = self.est.predict(shards)
        result_c = result.collect()
        assert isinstance(result, SparkXShards)
        assert result_c[0]["prediction"].shape == (4, 1000)
        assert result_c[1]["prediction"].shape == (2, 1000)
        assert self.check_result(result_c[0]["prediction"], 4)
        assert self.check_result(result_c[1]["prediction"], 2)
Example #23
    def test_dataframe_predict(self):

        sc = init_nncontext()
        rdd = sc.parallelize(range(100))

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        df = rdd.map(lambda x: ([float(x)] * 50,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
        result = np.concatenate(
            [shard["prediction"] for shard in result.collect()])
        assert np.array_equal(result, np.array(range(100)).astype(np.float))
Example #24
    def write(path,
              generator,
              schema,
              block_size=1000,
              write_mode="overwrite",
              **kwargs):
        """
        Take each record in the generator and write it to a parquet file.

        **generator**
        Each record in the generator is a dict; each key is a string that becomes the
        column name of the saved parquet record, and the value is the data.

        **schema**
        The schema defines the name, dtype, and shape of each column, as well as its
        feature type. The feature type defines how to encode and decode the column value.

        There are three kinds of feature types:
        1. Scalar, such as an int, a float, or a string, which can be directly mapped
           to a parquet type.
        2. NDarray, which takes an np.ndarray and saves its serialized bytes. The
           corresponding parquet type is BYTE_ARRAY.
        3. Image, which takes a string path to an image file on the local file system
           and saves the raw file content bytes.
           The corresponding parquet type is BYTE_ARRAY.

        :param path: the output path, e.g. file:///output/path, hdfs:///output/path
        :param generator: a generator yielding dicts, whose keys are strings and values are
                          one of (a scalar value, ndarray, image file path)
        :param schema: a dict, whose keys are strings and values are one of
                       (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
        :param kwargs: other args
        """

        sc = init_nncontext()
        spark = SparkSession(sc)
        node_num, core_num = get_node_and_core_number()
        for i, chunk in enumerate(chunks(generator, block_size)):
            chunk_path = os.path.join(path, f"chunk={i}")
            rows_rdd = sc.parallelize(chunk, core_num * node_num)\
                .map(lambda x: dict_to_row(schema, x))
            spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(
                chunk_path)
        metadata_path = os.path.join(path, "_orca_metadata")

        write_text(metadata_path, encode_schema(schema))
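Below is a minimal usage sketch for the write() function above, assuming the generator/schema contract described in its docstring; the schema_field constructors, their arguments, and the output path are illustrative assumptions rather than verified API.

# Hypothetical usage of write(), based only on the docstring above.
# The schema_field.Scalar/NDarray/Image constructors and their arguments are assumed,
# not taken from a verified API.
import numpy as np

def record_generator():
    # Each yielded record is a dict: column name -> value
    # (a scalar, an np.ndarray, or a path to an image file).
    for i in range(10):
        yield {
            "id": i,                                             # Scalar column
            "embedding": np.random.rand(16).astype(np.float32),  # NDarray column
            "image": "/tmp/images/img_{}.jpg".format(i),         # Image column (file path)
        }

# Assumed schema: maps each column name to its feature type.
schema = {
    "id": schema_field.Scalar(),          # stored as a native parquet type
    "embedding": schema_field.NDarray(),  # serialized to BYTE_ARRAY
    "image": schema_field.Image(),        # raw file bytes stored as BYTE_ARRAY
}

# write() splits the generator into blocks of block_size records, saves each block
# under <path>/chunk=<i> as parquet, and writes an _orca_metadata file alongside.
write("file:///tmp/orca_parquet_demo", record_generator(), schema,
      block_size=1000, write_mode="overwrite")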
Example #25
    def test_xshards_predict(self):

        sc = init_nncontext()
        rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result_shards = estimator.predict(shards, batch_size=4)
        result = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])

        assert np.array_equal(result, expected_result)
Example #26
    def test_dataframe_train_eval(self):

        sc = init_nncontext()
        rdd = sc.range(0, 100)
        df = rdd.map(lambda x: (np.random.randn(50).astype(np.float).tolist(
        ), [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

        estimator = get_estimator(workers_per_node=2)
        estimator.fit(df,
                      batch_size=4,
                      epochs=2,
                      feature_cols=["feature"],
                      label_cols=["label"])
        estimator.evaluate(df,
                           batch_size=4,
                           feature_cols=["feature"],
                           label_cols=["label"])
Example #27
def main(max_epoch):
    _ = init_nncontext()

    (training_images_data,
     training_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    (testing_images_data,
     testing_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    training_images_data = (training_images_data -
                            mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    testing_images_data = (testing_images_data -
                           mnist.TRAIN_MEAN) / mnist.TRAIN_STD

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)

    keras_model.fit(training_images_data,
                    training_labels_data,
                    validation_data=(testing_images_data, testing_labels_data),
                    epochs=max_epoch,
                    batch_size=320,
                    distributed=True)

    result = keras_model.evaluate(testing_images_data,
                                  testing_labels_data,
                                  distributed=True,
                                  batch_per_thread=80)

    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    keras_model.save_weights("/tmp/mnist_keras.h5")
Example #28
def main(max_epoch, data_num):
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    def get_data_rdd(dataset):
        (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        rdd = image_rdd.zip(labels_rdd) \
            .map(lambda rec_tuple: [normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                    np.array(rec_tuple[1])])
        return rdd

    training_rdd = get_data_rdd("train")
    testing_rdd = get_data_rdd("test")
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["features", "labels"],
                                 shapes=[[28, 28, 1], []],
                                 types=[tf.float32, tf.int32],
                                 batch_size=280,
                                 val_rdd=testing_rdd
                                 )

    data = Input(shape=[28, 28, 1])

    x = Flatten()(data)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    predictions = Dense(10, activation='softmax')(x)

    model = Model(inputs=data, outputs=predictions)

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    optimizer = TFOptimizer.from_keras(model, dataset)

    optimizer.set_train_summary(TrainSummary("/tmp/az_lenet", "lenet"))
    optimizer.set_val_summary(ValidationSummary("/tmp/az_lenet", "lenet"))
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/")
Example #29
    def test_openvino_predict_xshards(self):
        input_data_list = [
            np.random.random([1, 4, 3, 224, 224]),
            np.random.random([2, 4, 3, 224, 224])
        ]
        sc = init_nncontext()
        rdd = sc.parallelize(input_data_list, numSlices=2)
        shards = SparkXShards(rdd)

        def pre_processing(images):
            return {"x": images}

        shards = shards.transform_shard(pre_processing)
        result = self.est.predict(shards)
        result_c = result.collect()
        assert isinstance(result, SparkXShards)
        assert result_c[0]["prediction"].shape == (1, 4, 1000)
        assert result_c[1]["prediction"].shape == (2, 4, 1000)
Example #30
    def test_spark_xshards(self):
        from zoo import init_nncontext
        from zoo.orca.data import SparkXShards
        estimator = get_estimator(workers_per_node=1)
        sc = init_nncontext()
        x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
        # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
        y_rdd = sc.parallelize(
            np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32))
        rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
        train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
        train_xshards = SparkXShards(train_rdd)
        val_xshards = SparkXShards(val_rdd)
        train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
        print(train_stats)
        val_stats = estimator.evaluate(val_xshards, batch_size=128)
        print(val_stats)
        estimator.shutdown()