def test_vector_size_hint(self):
    df = self.spark.createDataFrame(
        [(0, Vectors.dense([0.0, 10.0, 0.5])),
         (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
         (2, Vectors.dense([2.0, 12.0]))],
        ["id", "vector"])

    sizeHint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
    sizeHint.setSize(3)
    self.assertEqual(sizeHint.getSize(), 3)

    # With handleInvalid="skip", only the row whose vector matches the
    # declared size (id 0) survives the transform.
    output = sizeHint.transform(df).head().vector
    expected = DenseVector([0.0, 10.0, 0.5])
    self.assertEqual(output, expected)
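# A minimal sketch contrasting VectorSizeHint's three handleInvalid modes:
# "error" (the default) raises at transform time on a mismatched or null
# vector, "skip" filters the row out, and "optimistic" passes everything
# through unchecked. Assumes an active `spark` session, as in the test above.
from pyspark.ml.feature import VectorSizeHint
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame(
    [(0, Vectors.dense([0.0, 10.0, 0.5])),
     (1, Vectors.dense([1.0, 11.0]))],  # wrong size: 2 instead of 3
    ["id", "vector"])

skip = VectorSizeHint(inputCol="vector", size=3, handleInvalid="skip")
skip.transform(df).count()        # 1: the short vector is dropped

optimistic = VectorSizeHint(inputCol="vector", size=3, handleInvalid="optimistic")
optimistic.transform(df).count()  # 2: no validation is performed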
# COMMAND ----------

### VectorSizeHint annotates a vector column with its expected size; with
### handleInvalid="skip", rows whose vectors do not match the declared size
### are filtered out before downstream stages see them.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorSizeHint, VectorAssembler

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(inputCol="userFeatures", handleInvalid="skip", size=3)
datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

# COMMAND ----------
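# A quick way to inspect what VectorSizeHint actually did (assuming the
# notebook state above): it records the declared size in the column metadata,
# which downstream stages such as VectorAssembler read. The exact metadata
# layout is a Spark internal and may differ across versions.
print(datasetWithSize.schema["userFeatures"].metadata)
# e.g. {'ml_attr': {'num_attrs': 3}}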
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorSizeHint, VectorAssembler

spark = SparkSession\
    .builder\
    .appName("VectorSizeHintExample")\
    .getOrCreate()

# $example on$
dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(inputCol="userFeatures", handleInvalid="skip", size=3)
datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)
# $example off$

spark.stop()
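# A minimal sketch of the main use case for VectorSizeHint: placing it in a
# Pipeline ahead of VectorAssembler so the assembler knows the vector size up
# front (useful when size metadata is absent, as on streaming DataFrames).
# Assumes an active `spark` session; `dataset` is the same toy data as above.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorSizeHint
from pyspark.ml.linalg import Vectors

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

pipeline = Pipeline(stages=[
    VectorSizeHint(inputCol="userFeatures", size=3, handleInvalid="skip"),
    VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                    outputCol="features"),
])
pipeline.fit(dataset).transform(dataset)\
    .select("features", "clicked").show(truncate=False)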
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.feature import VectorSizeHint, Word2VecModel, VectorAssembler
from pyspark.ml.clustering import KMeansModel

# `base_path` and the `centroid` helper are defined elsewhere in the original script.
dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'),
             F.array('msin'), F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler
sizeHint = VectorSizeHint(inputCol="vcategorical", handleInvalid="skip", size=50)
dataset = sizeHint.transform(dataset)
vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Clustering
model_path = "{}/data/distanceKmeansRmW2VModel.bin".format(base_path)
model = KMeansModel.load(model_path)
predictions = model.transform(dataset)
centers = model.clusterCenters()
vectorCent = F.udf(lambda k: centroid(k, centers), ArrayType(DoubleType()))
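# The snippet above references a `centroid` helper that is not shown. A
# plausible sketch, assuming it simply looks up the assigned cluster's center
# (the actual implementation in the original code base may differ):
def centroid(k, centers):
    # `k` is the cluster index from the "prediction" column; `centers` is the
    # list of numpy arrays returned by model.clusterCenters(). Convert to
    # plain floats so the result matches the UDF's ArrayType(DoubleType()).
    return [float(x) for x in centers[k]]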