Example #1
    def test_sparkml_r_pipeline(self):
        import inspect
        import os
        import numpy
        import pandas
        from pyspark.ml import PipelineModel

        # convert_sparkml, buildInitialTypesSimple, buildInputDictSimple,
        # run_with_runtime and _compare_expected are helpers provided by the
        # surrounding SparkML test module

        # locate the test data relative to this script
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "iris.csv")
        data = self.spark.read.format('csv') \
            .options(header='true', inferschema='true').load(input_path) \
            .drop('_index_')

        # read the model from disk
        pipeline_path = os.path.join(this_script_dir, "mlpmodel")
        model = PipelineModel.load(path=pipeline_path)

        # create Onnx model
        model_onnx = convert_sparkml(model,
                                     'Sparkml R Pipeline',
                                     buildInitialTypesSimple(data),
                                     spark_session=self.spark)
        # save Onnx model for runtime usage
        if model_onnx is None:
            raise AssertionError("Failed to create the onnx model")
        model_path = os.path.join(this_script_dir, "tests_dump",
                                  "r_pipeline_model.onnx")
        with open(model_path, "wb") as f:
            f.write(model_onnx.SerializeToString())

        data_np = buildInputDictSimple(data)
        # run the model in Spark
        spark_prediction = model.transform(data)
        # run the model in onnx runtime
        output, session = run_with_runtime(data_np, model_path)

        # compare results
        expected = [
            spark_prediction.toPandas().label.values.astype(numpy.float32),
            spark_prediction.toPandas().prediction.values.astype(
                numpy.float32),
            spark_prediction.toPandas().probability.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        _compare_expected(expected,
                          output,
                          session,
                          model_path,
                          decimal=5,
                          onnx_shape=None)
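The run_with_runtime helper used above is not shown in these examples. Assuming it simply scores the saved model with ONNX Runtime, a minimal sketch of that step could look like the following (the name score_onnx_model is illustrative, and the keys of input_feed must match the input names declared by buildInitialTypesSimple):

    import onnxruntime

    def score_onnx_model(model_path, input_feed):
        # load the serialized ONNX model and run one inference pass on CPU
        session = onnxruntime.InferenceSession(
            model_path, providers=["CPUExecutionProvider"])
        # passing None as the output list returns every declared output
        # (e.g. label, prediction, probability)
        outputs = session.run(None, input_feed)
        return outputs, session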
Example #2
    def test_profile_sparkml_pipeline(self):
        import inspect
        import os
        import numpy
        import pandas
        import time
        import pathlib
        import mleap.pyspark
        from mleap.pyspark.spark_support import SimpleSparkSerializer
        from pyspark.ml import PipelineModel

        # additional jar files (needed by MLeap) must be added before the
        # SparkSession is created; self.spark is assumed to be configured already
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv') \
            .options(header='true', inferschema='true').load(input_path)
        training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

        label = "income"
        dtypes = dict(training_data.dtypes)
        dtypes.pop(label)

        si_xvars = []
        ohe_xvars = []
        feature_cols = []
        for idx, key in enumerate(dtypes):
            if dtypes[key] == "string":
                feature_col = "-".join([key, "encoded"])
                feature_cols.append(feature_col)

                tmp_col = "-".join([key, "tmp"])
                si_xvars.append(StringIndexer(inputCol=key, outputCol=tmp_col, handleInvalid="skip"))
                ohe_xvars.append(OneHotEncoderEstimator(inputCols=[tmp_col], outputCols=[feature_col], dropLast=False))
            else:
                feature_cols.append(key)
        si_label = StringIndexer(inputCol=label, outputCol='label')
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        lr = LogisticRegression(regParam=0.001)
        pipeline = Pipeline(stages=si_xvars + ohe_xvars + [si_label, assembler, lr])

        # use only a single record for prediction; this also skips the
        # records that would otherwise cause errors
        test_data = test_data.limit(1)
        # create Spark and Onnx models
        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', buildInitialTypesSimple(test_data))
        # save Onnx model for runtime usage
        if model_onnx is None:
            raise AssertionError("Failed to create the onnx model")
        model_path = os.path.join("tests", "profile_pipeline_model.onnx")
        with open(model_path, "wb") as f:
            f.write(model_onnx.SerializeToString())

        # Create MLeap model
        model_zip_path = os.path.join(this_script_dir, "tests", "mleap-pipeline.zip")
        if os.path.exists(model_zip_path):
            os.remove(model_zip_path)
        model_zip_url = "jar:" + pathlib.Path(model_zip_path).as_uri()
        # save the pipeline also in MLeap format
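        # only the schema of the transformed output is used here, so an
        # empty DataFrame built from that schema is enough for serialization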
        empty_df = self.spark.createDataFrame([], model.transform(test_data).schema)
        model.serializeToBundle(model_zip_url, empty_df)
        mleap_pipeline = PipelineModel.deserializeFromBundle(model_zip_url)

        spark_times = []
        mleap_times = []
        runtime_times = []
        for i in range(0, 20):
            data_np = buildInputDictSimple(test_data)
            # run the model in Spark
            start = time.time()
            spark_prediction = model.transform(test_data)
            end = time.time()
            spark_times.append(1000 * (end - start))

            # run with MLeap
            start = time.time()
            mleap_prediction = mleap_pipeline.transform(test_data)
            end = time.time()
            mleap_times.append(1000 * (end - start))

            if i == 0:  # compare only once
                _compare_mleap_pyspark(mleap_prediction, spark_prediction)

            # run the model in onnx runtime
            start = time.time()
            output, session = run_with_runtime(data_np, model_path)
            end = time.time()
            runtime_times.append(1000 * (end - start))

            # compare results
            if i == 0:  # compare only once
                expected = [
                    spark_prediction.toPandas().label.values.astype(numpy.float32),
                    spark_prediction.toPandas().prediction.values.astype(numpy.float32),
                    spark_prediction.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(
                        numpy.float32)
                ]
                _compare_expected(expected, output, session, model_path, decimal=5, onnx_shape=None)

        gen_plot(spark_times, mleap_times, runtime_times)
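The gen_plot helper is likewise not included in the example. Assuming it only visualizes the three latency series collected above, a hypothetical matplotlib version could look like this:

    import matplotlib.pyplot as plt

    def gen_plot(spark_times, mleap_times, runtime_times):
        # plot per-iteration latencies (in milliseconds) for each backend
        plt.plot(spark_times, label="pyspark")
        plt.plot(mleap_times, label="mleap")
        plt.plot(runtime_times, label="onnxruntime")
        plt.xlabel("iteration")
        plt.ylabel("latency (ms)")
        plt.legend()
        plt.show()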
Example #3
    def test_profile_sparkml_pipeline(self):
        import inspect
        import os
        import numpy
        import pandas
        import time
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv') \
            .options(header='true', inferschema='true').load(input_path)
        training_data, test_data = full_data.randomSplit([0.9, 0.1], seed=1)

        label = "income"
        dtypes = dict(training_data.dtypes)
        dtypes.pop(label)

        si_xvars = []
        ohe_xvars = []
        feature_cols = []
        for idx, key in enumerate(dtypes):
            if dtypes[key] == "string":
                feature_col = "-".join([key, "encoded"])
                feature_cols.append(feature_col)

                tmp_col = "-".join([key, "tmp"])
                si_xvars.append(
                    StringIndexer(inputCol=key,
                                  outputCol=tmp_col,
                                  handleInvalid="skip"))
                ohe_xvars.append(
                    OneHotEncoderEstimator(inputCols=[tmp_col],
                                           outputCols=[feature_col],
                                           dropLast=False))
            else:
                feature_cols.append(key)
        si_label = StringIndexer(inputCol=label, outputCol='label')
        assembler = VectorAssembler(inputCols=feature_cols,
                                    outputCol="features")
        lr = LogisticRegression(regParam=0.001)
        pipeline = Pipeline(stages=si_xvars + ohe_xvars +
                            [si_label, assembler, lr])

        # fit the Spark pipeline and convert the fitted model to ONNX
        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline',
                                     buildInitialTypesSimple(test_data))
        if model_onnx is None:
            raise AssertionError("Failed to create the onnx model")
        model_path = os.path.join("tests", "profile_pipeline_model.onnx")
        with open(model_path, "wb") as f:
            f.write(model_onnx.SerializeToString())

        rec_counts = []
        spark_times = []
        runtime_times = []
        for i in range(0, 4):
            rec_counts.append(test_data.count())
            data_np = buildInputDictSimple(test_data)
            # run the model in Spark
            start = time.time()
            predicted = model.transform(test_data)
            end = time.time()
            spark_times.append(1000 * (end - start))

            # test for correctness also
            expected = [
                predicted.toPandas().label.values.astype(numpy.float32),
                predicted.toPandas().prediction.values.astype(numpy.float32),
                predicted.toPandas().probability.apply(
                    lambda x: pandas.Series(x.toArray())).values.astype(
                        numpy.float32)
            ]
            # run the model in onnx runtime
            start = time.time()
            output, session = run_with_runtime(data_np, model_path)
            end = time.time()
            runtime_times.append(1000 * (end - start))

            # compare results
            _compare_expected(expected,
                              output,
                              session,
                              model_path,
                              decimal=5,
                              onnx_shape=None)

            # double the number of rows for the next iteration
            test_data = test_data.union(test_data)

        results = pandas.DataFrame(
            data={
                'input_rec_count': rec_counts,
                'pyspark (ms)': spark_times,
                'onnxruntime (ms)': runtime_times
            })
        print(results)
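buildInitialTypesSimple and buildInputDictSimple are helpers from the surrounding test utilities and are not shown here. Assuming buildInitialTypesSimple only maps each input column of the Spark DataFrame to a single-row ONNX tensor type for convert_sparkml, a rough sketch of the idea (the function name and shapes are illustrative, not the actual implementation) might be:

    from onnxmltools.convert.common.data_types import (
        FloatTensorType, StringTensorType)

    def build_initial_types(df):
        # one (column name, tensor type) pair per input column, one row each
        initial_types = []
        for name, dtype in df.dtypes:
            if dtype == "string":
                initial_types.append((name, StringTensorType([1, 1])))
            else:
                initial_types.append((name, FloatTensorType([1, 1])))
        return initial_types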