from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType


def scaling(dataFrame, inputColName, Min, Max):
    """Scale a numeric column to the range [Min, Max] and append it as a new column."""
    outputColName = "scaled " + inputColName
    # MinMaxScaler only operates on vector columns, so wrap the input column first
    assembler = VectorAssembler(inputCols=[inputColName], outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = MinMaxScaler(inputCol="features", outputCol=outputColName)
    scaler.setMax(Max).setMin(Min)
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    # Unpack the single-element result vector back into a plain float column
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' to the range ({1:f}, {2:f}) and created a new column '{3:s}'."
          .format(inputColName, scaler.getMin(), scaler.getMax(), outputColName))
    return scaledDF
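A minimal usage sketch for the helper above, assuming an active SparkSession named `spark`; the DataFrame and its `age` column are hypothetical:

# Hypothetical example: scale an "age" column to [0, 1].
df = spark.createDataFrame([(23.0,), (41.0,), (65.0,)], ["age"])
scaledDF = scaling(df, "age", 0.0, 1.0)
scaledDF.show()  # adds a "scaled age" column with values 0.0, ~0.43, 1.0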
from pyspark.ml.feature import MinMaxScaler


class mmscaler_wrapper():
    """Wrap MinMaxScaler and remember the original column range so that
    scaled values can be mapped back (denormalized) later."""

    def __init__(self, inputCol, outputCol, s_min=0, s_max=1):
        self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
        self.mmModel.setMin(s_min)
        self.mmModel.setMax(s_max)
        self.in_column = inputCol
        self.originalMin = None
        self.originalMax = None

    def get_input_col_name(self):
        return self.mmModel.getInputCol()

    def getMax(self):
        return self.mmModel.getMax()

    def getMin(self):
        return self.mmModel.getMin()

    def describe(self):
        print('describe')

    def fit(self, df):
        # Record the pre-scaling range of the (vector-valued) input column
        col = self.mmModel.getInputCol()
        self.originalMin = df.select(col).rdd.flatMap(lambda x: x[0]).min()
        self.originalMax = df.select(col).rdd.flatMap(lambda x: x[0]).max()
        return self.mmModel.fit(df)

    # Denormalize: invert min-max scaling to recover the original value
    def denormalize(self, value):
        v = (value - self.getMin()) / (self.getMax() - self.getMin()) \
            * (self.originalMax - self.originalMin) + self.originalMin
        return v if v is not None else -999  # -999 is the sentinel for missing values

    def denormalize_df(self, df):
        # not implemented yet
        col = self.mmModel.getInputCol()

    def normalize(self, value):
        pass
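A usage sketch for the wrapper, assuming an active SparkSession `spark`; the data and column names are illustrative:

from pyspark.ml.linalg import Vectors

# Illustrative round trip: scale a vector column to [0, 1], then map a value back.
df = spark.createDataFrame([(Vectors.dense([10.0]),),
                            (Vectors.dense([20.0]),),
                            (Vectors.dense([30.0]),)], ["features"])
wrapper = mmscaler_wrapper(inputCol="features", outputCol="scaled")
model = wrapper.fit(df)          # also records the original range [10.0, 30.0]
model.transform(df).show()       # scaled values 0.0, 0.5, 1.0
print(wrapper.denormalize(0.5))  # 20.0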
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# Rescale each feature to range [min, max]
scaledData = scalerModel.transform(dataFrame)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

### MaxAbsScaler (-1, 1)
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -8.0]),),
    (1, Vectors.dense([2.0, 1.0, -4.0]),),
    (2, Vectors.dense([4.0, 10.0, 8.0]),)
], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dataFrame)
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()
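MaxAbsScaler divides each feature by its maximum absolute value, so results land in [-1, 1] without shifting the data. A plain-Python check of that rule against the third feature column of the data above:

# Plain-Python check: max-abs scaling of the third feature column [-8.0, -4.0, 8.0]
col = [-8.0, -4.0, 8.0]
max_abs = max(abs(x) for x in col)
print([x / max_abs for x in col])  # [-1.0, -0.5, 1.0]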
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # Rescale each feature to range [min, max]
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()
    # $example off$

    spark.stop()
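For reference, MinMaxScaler maps each feature per column as x' = (x - E_min) / (E_max - E_min) * (max - min) + min. A plain-Python check of the middle row against the per-column ranges of the example data above:

# Worked check of min-max scaling for the middle row [2.0, 1.1, 1.0],
# using each column's (min, max) from the example data; target range is [0, 1].
ranges = [(1.0, 3.0), (0.1, 10.1), (-1.0, 3.0)]
row = [2.0, 1.1, 1.0]
print([(x - lo) / (hi - lo) for x, (lo, hi) in zip(row, ranges)])  # ~[0.5, 0.1, 0.5]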
########################
## RESCALING DATA SET ##
########################

# Neural networks typically perform better when the input data is preprocessed,
# so scale the feature space to min = 0 and max = 1
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(df)
scaledData = scalerModel.transform(df)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

new_df = scaledData.selectExpr(
    "label", "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean", "concave_points_mean",
    "symmetry_mean", "fractal_dimension_mean", "radius_se", "texture_se",
    "perimeter_se", "area_se", "smoothness_se", "compactness_se", "concavity_se",
    "concave_points_se", "symmetry_se", "fractal_dimension_se", "radius_worst",
    "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst",
    "fractal_dimension_worst", "features as oldFeature", "scaledFeatures as features")
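The `selectExpr` call above keeps all the raw columns, preserves the unscaled vector as `oldFeature`, and promotes the scaled vector to `features`. A toy sketch of the same "X as Y" aliasing pattern, with purely illustrative data and names:

# Toy illustration of selectExpr aliasing: keep a column under a new name
# while promoting another column into its place. Values are made up.
toy = spark.createDataFrame([(1, 10.0, 0.5)], ["label", "features", "scaledFeatures"])
toy.selectExpr("label", "features as oldFeature", "scaledFeatures as features").show()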
## part 2
print('*' * 100)
print('Part 2 - Normalize features between 0 and 1\n')

# assemble the feature values into a vector and add a column containing those vectors
assembler = VectorAssembler().setInputCols(data.columns[1:]).setOutputCol('features')
transformed = assembler.transform(data)

# create the scaler, transform the feature vectors, and add a scaledFeatures column
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(transformed.select('features'))
scaledData = scalerModel.transform(transformed)
print('Features scaled to range: {} to {}'.format(scaler.getMin(), scaler.getMax()))
# scaledData.select('_c0', 'features', 'scaledFeatures').show(10)

# limit the dataset to the label and the scaled vectors
scaledData = scaledData.select('_c0', 'scaledFeatures')

# rename columns
scaledData = scaledData.withColumnRenamed('_c0', 'label').withColumnRenamed(
    'scaledFeatures', 'features')
scaledData.show(5)

####################################################################################
## part 3
print('*' * 100)
print('Part 3 - \n')