Example #1
    def test_dct(self):
        data = self.spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]), )],
                                          ["vec"])
        model = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
        # the ONNX input name must match the transformer's inputCol ("vec")
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml DCT',
            [('vec', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().resultVec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().vec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlDCT")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #2
from pyspark.ml.feature import DCT

def DCTTransform(df, hiperparameter):
    '''
    Applies a Discrete Cosine Transform to the input DataFrame.
    Parameters:
        df – input dataset, an instance of pyspark.sql.DataFrame
        hiperparameter – dict with the keys 'inverse', 'inputCol' and 'outputCol'
    Returns:
        transformed DataFrame: the input column is inputCol and the DCT output column is outputCol
    '''
    dct = DCT(inverse=hiperparameter['inverse'],
              inputCol=hiperparameter['inputCol'],
              outputCol=hiperparameter['outputCol'])
    df_transformed = dct.transform(df)
    return df_transformed
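
A minimal usage sketch for the DCTTransform helper above (not part of the original example), assuming an active SparkSession named spark; the dictionary keys mirror the DCT constructor arguments:

from pyspark.ml.linalg import Vectors

# hypothetical input data; column names are illustrative only
hiperparameter = {'inverse': False, 'inputCol': 'vec', 'outputCol': 'vecDCT'}
df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]),)], ["vec"])
DCTTransform(df, hiperparameter).select("vecDCT").show(truncate=False)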
Example #3
import os

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, DCT, VectorSlicer, StandardScaler

# `spark` is assumed to be an already-created SparkSession
def df_train_test():
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))
    cols = ["vx"+str(i) for i in range(10)]
    assembler = VectorAssembler(inputCols=cols, outputCol="vx_t1")
    dct = DCT(inverse=False, inputCol="vx_t1", outputCol="vx_t2")
    slicer = VectorSlicer(inputCol="vx_t2", outputCol="vx_t3", indices=[i for i in range(40000)])
    scaler = StandardScaler(inputCol="vx_t3", outputCol="vx", withStd=True, withMean=False)

    pipeline = Pipeline(stages=[assembler, dct, slicer, scaler])
    p_model = pipeline.fit(df_train)

    df_train = p_model.transform(df_train)
    df_train = df_train.drop("vx0").drop("vx1").drop("vx2").drop("vx3").drop("vx4")
    df_train = df_train.drop("vx5").drop("vx6").drop("vx7").drop("vx8").drop("vx9")
    df_train = df_train.drop("vx_t1").drop("vx_t2").drop("vx_t3")

    df_test = p_model.transform(df_test)
    df_test = df_test.drop("vx0").drop("vx1").drop("vx2").drop("vx3").drop("vx4")
    df_test = df_test.drop("vx5").drop("vx6").drop("vx7").drop("vx8").drop("vx9")
    df_test = df_test.drop("vx_t1").drop("vx_t2").drop("vx_t3")

    df_train.write.mode("overwrite").parquet(os.path.join("datasets", "train.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(os.path.join("datasets", "test.vector.dct.parquet"))

    df_train.printSchema()
    df_test.printSchema()
Example #4
# -*- coding: utf-8 -*-

'''
Discrete Cosine Transform(DCT)
'''

from pyspark.sql import SparkSession
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName("dct").getOrCreate()

df = spark.createDataFrame([
    (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
    (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
    (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)], ["features"])

dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")

dctDf = dct.transform(df)

for dcts in dctDf.select("featuresDCT").take(3):
    print(dcts)

dctDf.show()
Example #5
def dct(dataset, inputCol, inverse=False):
    # apply a DCT to inputCol and append the result as '<inputCol>_dct'
    from pyspark.ml.feature import DCT
    return DCT(inverse=inverse, inputCol=inputCol, outputCol=inputCol + '_dct').transform(dataset)
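
A minimal usage sketch for the dct helper above (not part of the original example), assuming an active SparkSession named spark; the output column is named '<inputCol>_dct':

from pyspark.ml.linalg import Vectors

# hypothetical input data; 'features' is passed as inputCol, so the result lands in 'features_dct'
df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]),)], ["features"])
dct(df, "features").select("features_dct").show(truncate=False)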
Example #6

from __future__ import print_function

# $example on$
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("DCTExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
        (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
        (Vectors.dense([14.0, -2.0, -5.0, 1.0]),)], ["features"])

    dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")

    dctDf = dct.transform(df)

    dctDf.select("featuresDCT").show(truncate=False)
    # $example off$

    spark.stop()
Example #7
import os

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, DCT, VectorSlicer, StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql.types import IntegerType

# `spark` (SparkSession) and `sc` (SparkContext) are assumed to be created elsewhere
def vec_dct_to_win():
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))
    for j in range(8):
        cols = ["vx"+str(j+i) for i in range(3)]
        assembler = VectorAssembler(inputCols=cols, outputCol="vx_w"+str(j))
        dct = DCT(inverse=False, inputCol="vx_w"+str(j), outputCol="fr_w"+str(j))
        slicer = VectorSlicer(inputCol="fr_w"+str(j), outputCol="fs_w"+str(j), indices=[i for i in range(12000)])
        scaler = StandardScaler(inputCol="fs_w"+str(j), outputCol="fn_w"+str(j), withStd=True, withMean=False)

        pipeline = Pipeline(stages=[assembler, dct, slicer, scaler])
        pw_model = pipeline.fit(df_train)

        df_train = pw_model.transform(df_train).drop("vx"+str(j)).drop("vx_w"+str(j)).drop("fr_w"+str(j)).drop("fs_w"+str(j))
        df_test = pw_model.transform(df_test).drop("vx"+str(j)).drop("vx_w"+str(j)).drop("fr_w"+str(j)).drop("fs_w"+str(j))

    df_train.write.mode("overwrite").parquet(os.path.join("datasets", "train.win.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(os.path.join("datasets", "test.win.vector.dct.parquet"))

    df_train.printSchema()
    df_test.printSchema()

def bin_win_source_to(source, destination):
    df_w = spark.read.parquet(os.path.join("datasets", source))
    for j in range(8):
        rdd_w = df_w.rdd.map(lambda row, j=j:(row.seg, row["fn_w"+str(j)], []))
        for i in range(0,99):
            rdd_w = rdd_w.map(lambda row, i=i:(row[0], row[1], row[2] + [float(sum(abs(row[1].toArray()[i*120:i*120+240]))/120)]))
        rdd_w = rdd_w.map(lambda row: Row(seg=row[0],f=Vectors.dense(row[2])))
        df_tmp = rdd_w.toDF()
        df_tmp = df_tmp.selectExpr("seg AS seg2", "f AS f"+str(j)).drop("seg").drop("_c0")
        df_w = df_w.join(df_tmp, df_w.seg.cast(IntegerType()) == df_tmp.seg2.cast(IntegerType())).drop("seg2").drop("fn_w"+str(j))
    df_w = df_w.drop("vx8").drop("vx9")
    df_w.write.mode("overwrite").parquet(os.path.join("datasets", destination))

def slice_win_source_to(source, destination):
    df_w = spark.read.parquet(os.path.join("datasets", source))
    for j in range(8):
        slicer = VectorSlicer(inputCol="f"+str(j), outputCol="f_sl"+str(j), indices=[i for i in range(50,76)])
        df_w = slicer.transform(df_w).drop("f"+str(j))
    cols = ["f_sl"+str(i) for i in range(8)]
    assembler = VectorAssembler(inputCols=cols, outputCol="f")
    df_w = assembler.transform(df_w) 
    df_w.write.mode("overwrite").parquet(os.path.join("datasets", destination))
    df_w.printSchema()

def label_win_to():
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    df_target = spark.sql("""
    SELECT d0.seg, d0.y AS y0, d1.y AS y1, d2.y AS y2, d3.y AS y3, 
                d4.y AS y4, d5.y AS y5, d6.y AS y6, d7.y AS y7
    FROM        data AS d0 
    INNER JOIN  data AS d1 ON d1.no =  60000 AND d1.seg = d0.seg
    INNER JOIN  data AS d2 ON d2.no =  75000 AND d2.seg = d0.seg
    INNER JOIN  data AS d3 ON d3.no =  90000 AND d3.seg = d0.seg
    INNER JOIN  data AS d4 ON d4.no = 105000 AND d4.seg = d0.seg
    INNER JOIN  data AS d5 ON d5.no = 120000 AND d5.seg = d0.seg
    INNER JOIN  data AS d6 ON d6.no = 135000 AND d6.seg = d0.seg
    INNER JOIN  data AS d7 ON d7.no = 150000 AND d7.seg = d0.seg
    WHERE d0.no = 45000
    ORDER BY d0.seg
    """ )
    df_target.write.mode("overwrite").parquet(os.path.join("datasets", "train.win.target.parquet"))
    df_target.show()

def stats_feature_win_to():
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    df_stat = spark.sql("""
    SELECT seg,
        INT(no/1000) AS seq,
        AVG(x) AS x_avg,
        PERCENTILE(x, 0.02) AS x_p02,
        PERCENTILE(x, 0.98) AS x_p98,
        PERCENTILE(ABS(x), 0.95) AS xa_p95
    FROM data
    GROUP BY seg, INT(no/1000)
    ORDER BY seg, INT(no/1000)
    """ )
    df_agg = sc.parallelize([])
    rdd_temp = df_stat.rdd.map(lambda row:(row.seg, row.x_avg, row.x_p02, row.x_p98, row.xa_p95))      \
        .map(lambda data: (data[0], ([ data[1] ], [ data[2] ] , [ data[3] ] , [ data[4] ] )))          \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3])  )              \
        .map(lambda row: Row(seg=row[0],
                            stx1=Vectors.dense(row[1][0]),
                            stx2=Vectors.dense(row[1][1]),
                            stx3=Vectors.dense(row[1][2]),
                            stx4=Vectors.dense(row[1][3])))      
    # if df_agg.count() == 0: 
    df_agg = rdd_temp.toDF(["seg","stx1","stx2","stx3","stx4"])

    # df_agg.show()
    df_agg = df_agg.select("*").where("seg != 4194")
    scaler = StandardScaler(inputCol="stx1", outputCol="stxn1", withStd=True, withMean=False)
    scalerModel = scaler.fit(df_agg)
    df_agg = scalerModel.transform(df_agg).drop("stx1")

    scaler = StandardScaler(inputCol="stx2", outputCol="stxn2", withStd=True, withMean=False)
    scalerModel = scaler.fit(df_agg)
    df_agg = scalerModel.transform(df_agg).drop("stx2")

    scaler = StandardScaler(inputCol="stx3", outputCol="stxn3", withStd=True, withMean=False)
    scalerModel = scaler.fit(df_agg)
    df_agg = scalerModel.transform(df_agg).drop("stx3")

    scaler = StandardScaler(inputCol="stx4", outputCol="stxn4", withStd=True, withMean=False)
    scalerModel = scaler.fit(df_agg)
    df_agg = scalerModel.transform(df_agg).drop("stx4")

    df_agg.write.mode("overwrite").parquet(os.path.join("datasets", "train.win.stat.parquet"))


vec_dct_to_win()
print("vec_dct_to_win finish!!!!!!!!!!")
bin_win_source_to("train.win.vector.dct.parquet", "train.win.vector.fbin.parquet")
print("bin_win_source_to train finish!!!!")
bin_win_source_to("test.win.vector.dct.parquet", "test.win.vector.fbin.parquet")
print("bin_win_source_to test finish!!!!")
slice_win_source_to("train.win.vector.fbin.parquet", "train.win.vector.fbin.2.parquet")
print("slice_win_source_to train finish!!!!!!!")
slice_win_source_to("test.win.vector.fbin.parquet", "test.win.vector.fbin.2.parquet")
print("slice_win_source_to test finish!!!!!!!")
label_win_to()
print("label_win_to finish!!!!!!!")
stats_feature_win_to()
print("stats_feature_win_to finish!!!!!!!!!!!")
Example #8
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":

    spark = SparkSession.builder.appName("DCT").master("local").getOrCreate()

    df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ),
                                (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ),
                                (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )],
                               ["features"])

    dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")

    dct.transform(df).select("featuresDCT").show(truncate=False)

    spark.stop()
Example #9
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local", "samp")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([(Vectors.dense([-2.0, 2.3, 0.0]), ),
                                 (Vectors.dense([1.0, 2.0, 3.0]), )],
                                ["features"])
dct = DCT(inputCol="features", outputCol="DCTfeatures", inverse=False)
dctmodel = dct.transform(df)
dctmodel.select("DCTfeatures").show()
"""OUTPUT
+--------------------+
|         DCTfeatures|
+--------------------+
|[0.17320508075688...|
|[3.46410161513775...|
+--------------------+"""
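
The values above can be sanity-checked by hand: Spark documents its DCT as the unitary (scaled) DCT-II, whose leading coefficient is sum(x) / sqrt(N). A quick check, assuming that scaling:

import math

# leading coefficient of the unitary DCT-II = sum of the inputs divided by sqrt(N)
print(sum([-2.0, 2.3, 0.0]) / math.sqrt(3))  # ~0.1732050..., matches the first row
print(sum([1.0, 2.0, 3.0]) / math.sqrt(3))   # ~3.4641016..., matches the second row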