Example #1
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))], ["id", "vector"])

        sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
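
The test above uses handleInvalid="skip", which drops the two rows whose vectors are not length 3. For contrast, a minimal sketch of the default mode, assuming the same df and imports as in the test (the name errorHint is ours):

errorHint = VectorSizeHint(inputCol="vector", size=3, handleInvalid="error")
# handleInvalid defaults to "error": transforming a column that contains
# wrong-sized vectors raises instead of silently filtering those rows.
# errorHint.transform(df).collect()  # would fail on the size-4 and size-2 rows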
Example #2
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
Example #3

# COMMAND ----------

### VectorSizeHint adds size metadata to a vector column; with handleInvalid="skip", rows whose vector does not match the declared size are filtered out
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(inputCol="userFeatures",
                          handleInvalid="skip",
                          size=3)

datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show(truncate=False)

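VectorSizeHint exists mainly because VectorAssembler needs to know every input vector's size, which it cannot infer on a streaming DataFrame. A minimal sketch, assuming the sizeHint and assembler defined in the cell above, of the usual pattern of chaining both in one Pipeline:

from pyspark.ml import Pipeline

# Transformers are valid Pipeline stages; fit() simply passes them through,
# and the fitted model applies the hint before the assembler on every batch.
pipeline = Pipeline(stages=[sizeHint, assembler])
pipelineModel = pipeline.fit(dataset)
pipelineModel.transform(dataset).select("features", "clicked").show(truncate=False)
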
Example #4

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSizeHintExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame(
        [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
         (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
        ["id", "hour", "mobile", "userFeatures", "clicked"])

    sizeHint = VectorSizeHint(
        inputCol="userFeatures",
        handleInvalid="skip",
        size=3)

    datasetWithSize = sizeHint.transform(dataset)
    print("Rows where 'userFeatures' is not the right size are filtered out")
    datasetWithSize.show(truncate=False)

    assembler = VectorAssembler(
        inputCols=["hour", "mobile", "userFeatures"],
        outputCol="features")

    # This dataframe can be used by downstream transformers as before
    output = assembler.transform(datasetWithSize)
    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(truncate=False)
    # $example off$

    spark.stop()
Example #5

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.feature import VectorSizeHint, VectorAssembler, Word2VecModel
from pyspark.ml.clustering import KMeansModel

# `base_path`, `dataset` and the `centroid` helper are defined elsewhere in the
# original project.
# Concatenate the identifier fields into a single array column of tokens
dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'),
             F.array('tac'), F.array('snr')))

# Word2Vec: embed the `categorical` token array with a pre-trained model
word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorSizeHint + VectorAssembler

# Declare the size of the 'vcategorical' vector column (presumably the
# Word2Vec output); rows whose vectors are not length 50 are filtered out.
sizeHint = VectorSizeHint(inputCol="vcategorical",
                          handleInvalid="skip",
                          size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Classification

model_path = "{}/data/distanceKmeansRmW2VModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)

centers = model.clusterCenters()

# UDF mapping a cluster index to its center's coordinates via the project's
# `centroid` helper
vectorCent = F.udf(lambda k: centroid(k, centers), ArrayType(DoubleType()))
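
The snippet never shows centroid. A plausible stand-in, assuming it just returns cluster k's center as a list of Python floats for the ArrayType(DoubleType()) UDF (this helper and the 'center' column name are our guesses, not the original project's code):

def centroid(k, centers):
    # Hypothetical helper: look up the k-th KMeans center and convert its
    # numpy values to plain floats so the UDF can emit ArrayType(DoubleType()).
    return [float(x) for x in centers[k]]

# Typical usage: attach each row's cluster center as a new array column.
predictions = predictions.withColumn('center', vectorCent(F.col('prediction')))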