def test_dct(self):
    """Round-trip a Spark ML DCT transformer through ONNX conversion and
    verify the ONNX runtime output matches Spark's own transform output."""
    df = self.spark.createDataFrame(
        [(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
    # The ONNX input name must match the transformer's inputCol ("vec").
    dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
    n_features = df.first()[0].size
    n_rows = df.count()
    model_onnx = convert_sparkml(
        dct, 'Sparkml DCT',
        [('vec', FloatTensorType([n_rows, n_features]))])
    self.assertTrue(model_onnx is not None)
    # Expected values come from running the Spark transformer itself.
    transformed = dct.transform(df)
    expected = transformed.toPandas().resultVec.apply(
        lambda v: pandas.Series(v.toArray())).values.astype(numpy.float32)
    data_np = df.toPandas().vec.apply(
        lambda v: pandas.Series(v.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, dct, model_onnx,
                             basename="SparkmlDCT")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def DCTTransform(df, hiperparameter):
    """Apply a Discrete Cosine Transform stage to the input DataFrame.

    Parameters:
        df: input dataset, an instance of pyspark.sql.DataFrame.
        hiperparameter: dict of DCT settings with keys 'inverse',
            'inputCol' and 'outputCol'.

    Returns:
        The transformed DataFrame; the input column is `inputCol` and the
        transform result is written to `outputCol`.
    """
    transformer = DCT(
        inverse=hiperparameter['inverse'],
        inputCol=hiperparameter['inputCol'],
        outputCol=hiperparameter['outputCol'],
    )
    return transformer.transform(df)
# _*_ coding:utf-8 _*_
'''
Discrete Cosine Transform(DCT)
'''
from pyspark.sql import SparkSession
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName("dct").getOrCreate()

# Three dense 4-element vectors to transform.
df = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
     (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
     (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)],
    ["features"])

# Forward (inverse=False) DCT of each feature vector.
dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
dctDf = dct.transform(df)

for dcts in dctDf.select("featuresDCT").take(3):
    print(dcts)

dctDf.show()
# from __future__ import print_function
# $example on$
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = (SparkSession
             .builder
             .appName("DCTExample")
             .getOrCreate())

    # $example on$
    df = spark.createDataFrame(
        [(Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
         (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
         (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)],
        ["features"])

    # Forward DCT; results land in the "featuresDCT" column.
    dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
    dctDf = dct.transform(df)

    dctDf.select("featuresDCT").show(truncate=False)
    # $example off$

    spark.stop()
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("DCT").master("local").getOrCreate()

    df = spark.createDataFrame(
        [(Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
         (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
         (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)],
        ["features"])

    # BUG FIX: the original set outputCol="FeaturesDCT" but then selected
    # "featuresDCT"; DataFrame column resolution is case-sensitive here, so
    # the select raised an AnalysisException. Use one consistent name.
    dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
    dct.transform(df).select("featuresDCT").show(truncate=False)

    spark.stop()
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import DCT

# The Discrete Cosine Transform transforms a length N real-valued sequence in
# the time domain into another length N real-valued sequence in the frequency
# domain. A DCT class provides this functionality, implementing the DCT-II and
# scaling the result by sqrt(2) such that the representing matrix for the
# transform is unitary. No shift is applied to the transformed sequence (e.g.
# the 0th element of the transformed sequence is the 0th DCT coefficient and
# not the N/2th).
# PS: The obvious distinction between a DCT and a DFT is that the former uses
# only cosine functions, while the latter uses both cosines and sines (in the
# form of complex exponentials).

spark = SparkSession.builder.appName("DCT").getOrCreate()

rows = [
    (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
    (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
    (Vectors.dense([14.0, -2.0, -5.0, 1.0]),),
]
df = spark.createDataFrame(rows, ["features"])

# Forward DCT of every row's feature vector.
dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
dctDF = dct.transform(df)

dctDF.select("featuresDCT").show(truncate=False)
from pyspark.ml.feature import DCT
# BUG FIX: DCT is a spark.ml transformer and requires pyspark.ml.linalg
# vectors; the original imported pyspark.mllib.linalg.Vectors, which spark.ml
# rejects (wrong VectorUDT) on Spark 2.x+.
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
# NOTE(review): SQLContext is deprecated in favor of SparkSession; kept here
# to preserve this example's existing entry points.
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(
    [(Vectors.dense([-2.0, 2.3, 0.0]),),
     (Vectors.dense([1.0, 2.0, 3.0]),)],
    ["features"])

# Forward DCT of each feature vector into the "DCTfeatures" column.
dct = DCT(inputCol="features", outputCol="DCTfeatures", inverse=False)
dctmodel = dct.transform(df)
dctmodel.select("DCTfeatures").show()

"""OUTPUT
+--------------------+
|         DCTfeatures|
+--------------------+
|[0.17320508075688...|
|[3.46410161513775...|
+--------------------+"""