def test_clear_param(self):
    """Clearing outputCol on a fitted MaxAbsScalerModel must revert it to the UID-based default."""
    frame = self.spark.createDataFrame(
        [(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
    model = MaxAbsScaler(inputCol="a", outputCol="scaled").fit(frame)

    # The explicitly supplied output column is set and visible.
    self.assertTrue(model.isSet(model.outputCol))
    self.assertEqual(model.getOutputCol(), "scaled")

    model.clear(model.outputCol)

    # Once cleared, the param is unset and the default name is used;
    # the default is derived from the model UID, which starts with the class name.
    self.assertFalse(model.isSet(model.outputCol))
    self.assertEqual(model.getOutputCol()[:12], 'MaxAbsScaler')

    # transform() must emit the (default) output column as the second column.
    transformed = model.transform(frame)
    self.assertEqual(model.getOutputCol(), transformed.schema.names[1])
def scaling(dataFrame, inputColName):
    """Rescale one numeric column of ``dataFrame`` into (-1, 1) with MaxAbsScaler.

    Adds a new float column named ``"scaled " + inputColName`` (note the space
    in the prefix) and drops the intermediate "features" vector column.
    Returns the resulting DataFrame.
    """
    outputColName = "scaled " + inputColName
    # Wrap the single input column in a vector so the ML scaler can consume it.
    assembled = VectorAssembler(
        inputCols=[inputColName], outputCol="features").transform(dataFrame)
    model = MaxAbsScaler(
        inputCol="features", outputCol=outputColName).fit(assembled)
    scaled = model.transform(assembled).drop("features")
    # Unwrap the one-element result vector back into a plain float column.
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaled = scaled.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scale the column '{0:s}' to range (-1, 1) and create a new column '{1:s}'.".format(inputColName, outputColName))
    return scaled
def standardScaler(self):
    """Demonstrate StandardScaler, MinMaxScaler and MaxAbsScaler on the sample libsvm data."""
    from pyspark.ml.feature import StandardScaler

    frame = self.session.read.format("libsvm").load(
        self.dataDir + "/data/mllib/sample_libsvm_data.txt")

    # Standardize to unit standard deviation without centering (keeps sparsity).
    std = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                         withStd=True, withMean=False)
    std.fit(frame).transform(frame).show()

    # Rescale every feature into [min, max] (defaults to [0, 1]).
    minmax = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    minmaxModel = minmax.fit(frame)
    rescaled = minmaxModel.transform(frame)
    print("Features scaled to range: [%f, %f]" % (minmax.getMin(), minmax.getMax()))
    rescaled.select("features", "scaledFeatures").show()

    # Divide each feature by its max absolute value -> values land in [-1, 1].
    maxabs = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
    maxabs.fit(frame).transform(frame).select("features", "scaledFeatures").show()
def maxabs_scale(self, columns='*'):
    """Rescale the given columns into [-1, 1] by dividing each by its max absolute value.

    :param columns: list of column names, or '*' (default) for every column
                    of the wrapped DataFrame.
    :return: the updated DataFrame (also stored back on ``self._df``); each
             input column gains a ``<name>_scaled`` float column.
    """
    # Local import keeps the fix self-contained within this method.
    from pyspark.sql.types import FloatType

    if columns == "*":
        columns = self._df.schema.names
    else:
        assert isinstance(columns, list), "Error: columns argument must be a list!"

    # BUG FIX: the original udf had no returnType, which defaults to
    # StringType, so every scaled column came back as a string column
    # instead of a float (compare the FloatType udf used elsewhere in
    # this codebase).
    to_float = udf(lambda v: float(v[0]), FloatType())

    for column in columns:
        outputcol = column + '_scaled'
        # NOTE(review): the temporary 'features' column would clobber an
        # existing column of that name — confirm callers never have one.
        assembler = VectorAssembler(inputCols=[column], outputCol='features')
        assembled = assembler.transform(self._df)
        scaled = MaxAbsScaler(inputCol='features', outputCol=outputcol) \
            .fit(assembled).transform(assembled).drop('features')
        # Flatten the single-element result vector into a plain float column.
        self._df = scaled.withColumn(outputcol, to_float(outputcol))
    return self._df
def test_maxabs_scaler(self):
    """Round-trip a fitted MaxAbsScaler through ONNX and compare with Spark's own output."""
    frame = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),),
    ], ["id", "features"])
    fitted = MaxAbsScaler(inputCol='features', outputCol='scaled_features').fit(frame)

    # the input names must match the inputCol(s) above
    onnx_model = convert_sparkml(fitted, 'Sparkml MaxAbsScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(onnx_model is not None)

    # Reference output produced by Spark itself.
    spark_output = fitted.transform(frame)
    expected = spark_output.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = frame.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)

    # Persist fixtures, run the ONNX model, and compare to Spark's result.
    paths = save_data_models(data_np, expected, fitted, onnx_model,
                             basename="SparkmlMaxAbsScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def get_scaler(self, option):
    """Set up scaler for dataset.

    Returns a new scaler instance for 'standard', 'minmax' or 'maxabs',
    or None for any other option.
    """
    # Dispatch table of lazily-constructed scalers; only the requested
    # one is instantiated.
    factories = {
        'standard': lambda: StandardScaler(inputCol="features",
                                           outputCol="scaledFeatures",
                                           withStd=True, withMean=False),
        'minmax': lambda: MinMaxScaler(inputCol="features",
                                       outputCol="scaledFeatures"),
        'maxabs': lambda: MaxAbsScaler(inputCol="features",
                                       outputCol="scaledFeatures"),
    }
    build = factories.get(option)
    return build() if build is not None else None
def ml_pipeline_factory(inputCols, classifier, param_gird=None, *, param_grid=None):
    """
    Helper function to build a Spark ML pipeline based on a given classifier
    for the given feature names in the given DataFrame. Result is a
    `CrossValidator` that needs to be fitted with `.fit(df)` to the
    training/validation set.

    INPUT:
    - inputCols (list [string]): list of string names of the feature columns
      of `df` that shall be considered.
    - classifier: a classifier instance from pyspark.ml.classification
    - param_grid: a ParamGrid that was built for the passed classifier based
      on pyspark.ml.ParamGridBuilder
    - param_gird: deprecated misspelled alias of `param_grid`, kept so
      existing keyword callers do not break.

    OUTPUT:
    - result (CrossValidator): A Spark ML `CrossValidator`.
    """
    # BUG FIX: the signature misspelled the documented `param_grid` name.
    # Accept the correctly spelled keyword while honouring the old one.
    if param_grid is None:
        param_grid = param_gird

    # VectorAssembler: rows with invalid feature values are skipped.
    vecAssembler = VectorAssembler(inputCols=inputCols, outputCol="features",
                                   handleInvalid='skip')

    # Normalizer / Scaler
    """ TODO Apply Standardization instead of scaling to account for outliers """
    maScaler = MaxAbsScaler(inputCol="features", outputCol="features_scaled")

    # Define a pipeline: assemble -> scale -> classify.
    pipe = Pipeline(stages=[vecAssembler, maScaler, classifier])

    # Use cross-validation over the supplied parameter grid.
    # NOTE(review): labelCol='churn' is hard-coded — confirm against callers.
    cv = CrossValidator(estimator=pipe,
                        evaluator=MulticlassClassificationEvaluator(
                            labelCol='churn', metricName='f1'),
                        estimatorParamMaps=param_grid,
                        numFolds=3,
                        parallelism=4)
    return cv
def loadMaxAbsScaler(path):
    """Load a persisted MaxAbsScaler from ``path``.

    :param path: filesystem path the scaler was previously saved to.
    :return: the loaded MaxAbsScaler.

    NOTE(review): this loads the *estimator*; a fitted model would need
    ``MaxAbsScalerModel.load`` — confirm which one callers expect.
    """
    return MaxAbsScaler.load(path)
sScaler.fit(scaleDF).transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import MinMaxScaler minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol( "features").setOutputCol("features_minmax_scaled") fittedminMax = minMax.fit(scaleDF) fittedminMax.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import MaxAbsScaler maScaler = MaxAbsScaler().setInputCol("features").setOutputCol( "features_MaxAbs_scaled") fittedmaScaler = maScaler.fit(scaleDF) fittedmaScaler.transform(scaleDF).show() # COMMAND ---------- from pyspark.ml.feature import ElementwiseProduct from pyspark.ml.linalg import Vectors scaleUpVec = Vectors.dense(10.0, 15.0, 20.0) scalingUp = ElementwiseProduct()\ .setScalingVec(scaleUpVec)\ .setInputCol("features") scalingUp.transform(scaleDF).show() # COMMAND ----------
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function # $example on$ from pyspark.ml.feature import MaxAbsScaler # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession.builder.appName("MaxAbsScalerExample").getOrCreate() # $example on$ dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures") # Compute summary statistics and generate MaxAbsScalerModel scalerModel = scaler.fit(dataFrame) # rescale each feature to range [-1, 1]. scaledData = scalerModel.transform(dataFrame) scaledData.show() # $example off$ spark.stop()
).transform(df) temp_normalized_vector_col = temp_col_name(assembled) trained_parameters = load_trained_parameters(trained_parameters, {"input_column": input_column,}) scaler_model, scaler_model_loaded = load_pyspark_model_from_trained_parameters( trained_parameters, MinMaxScalerModel, "scaler_model" ) if scaler_model is None: scaler = MinMaxScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col) scaler_model = fit_and_save_model(trained_parameters, "scaler_model", scaler, assembled_wo_nans) output_df = transform_using_trained_model(scaler_model, assembled, scaler_model_loaded) scaler = MaxAbsScaler(inputCol=temp_vector_col, outputCol=temp_normalized_vector_col) output_df = scaler.fit(assembled_wo_nans).transform(assembled) # convert the resulting vector back to numeric temp_flattened_vector_col = temp_col_name(output_df) output_df = output_df.withColumn(temp_flattened_vector_col, vector_to_array(temp_normalized_vector_col)) # keep only the final scaled column. output_column = input_column if output_column is None or not output_column else output_column output_column_value = sf.col(temp_flattened_vector_col)[0].alias(output_column) output_df = output_df.withColumn(output_column, output_column_value) final_columns = list(dict.fromkeys((list(df.columns) + [output_column]))) output_df = output_df.select(final_columns) return default_spark_with_trained_parameters(output_df, trained_parameters)
#Precise daily perspective fc_5 = ["Hour", "Minute"] + [enc.getOutputCol() for enc in encoders] #Unprecise daily perspective fc_6 = ["Hour"] + [enc.getOutputCol() for enc in encoders] fcs = [fc_1, fc_2, fc_3, fc_4, fc_5, fc_6] #=========== END FC ===========# standard_scaler = StandardScaler(inputCol="Features", outputCol="scaledFeatures", withStd=False, withMean=True) min_max_scaler = MinMaxScaler(inputCol="Features", outputCol="scaledFeatures") max_abs_scaler = MaxAbsScaler(inputCol="Features", outputCol="scaledFeatures") norm_standard_scaler = StandardScaler(inputCol="normFeatures", outputCol="scaledFeatures", withStd=False, withMean=True) norm_min_max_scaler = MinMaxScaler(inputCol="normFeatures", outputCol="scaledFeatures") norm_max_abs_scaler = MaxAbsScaler(inputCol="normFeatures", outputCol="scaledFeatures") normalizer = Normalizer(inputCol="Features", outputCol="normFeatures") ######END PIPELINE from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier, DecisionTreeClassifier
# Standardize the feature vector.
# NOTE(review): no setOutputCol here — the output column presumably falls
# back to a default name; confirm against the displayed schema.
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

# Rescale each feature into the explicit [5, 10] range.
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

# Divide each feature by its max absolute value -> values land in [-1, 1].
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Multiply every feature vector component-wise by a fixed scaling vector.
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()
def process(self, data_input, data_output):
    """
    A spark process to do feature engineering.

    :param data_input: data input filename (parquet)
    :param data_output: data output filename (parquet)
    """
    df = self.spark.read.parquet(data_input).select(
        'SHP_DATE_CREATED_ID', 'SHP_DATETIME_CREATED_ID',
        'SHP_DATE_HANDLING_ID', 'SHP_DATETIME_HANDLING_ID',
        'SHP_SENDER_ID', 'SHP_ORDER_COST', 'CAT_CATEG_ID_L7',
        'SHP_ADD_ZIP_CODE', 'SHP_DATE_SHIPPED_ID',
        'SHP_DATETIME_SHIPPED_ID', 'HT_REAL')

    # 1. SHP_ORDER_COST_INT: cast the SHP_ORDER_COST column from float to integer.
    df = df.withColumn("SHP_ORDER_COST_INT", (df["SHP_ORDER_COST"].cast(IntegerType())))

    # 2. SHP_DAY: add a column with the day of the week the payment was credited.
    shp_day_udf = udf(self.shp_day, IntegerType())
    df = df.withColumn('SHP_DAY', shp_day_udf(df['SHP_DATE_HANDLING_ID']))

    # 3. WKND_DAY: add a flag indicating whether the payment was credited on a weekend.
    weekend_day_udf = udf(self.weekend_day, IntegerType())
    df = df.withColumn('WKND_DAY', weekend_day_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WKND_DAY').show(10)  # debug output

    # 4. WK_NUM: add the week of the year the payment was made.
    # (the original comments for steps 4 and 5 were swapped relative to the code)
    week_number_udf = udf(self.week_number, IntegerType())
    df = df.withColumn('WK_NUM', week_number_udf(df['SHP_DATE_HANDLING_ID']))
    df.select('WK_NUM').show(10)  # debug output

    # 5. MONTH_NUM: add the month of the payment.
    month_number_udf = udf(self.month_number, IntegerType())
    df = df.withColumn('MONTH_NUM', month_number_udf(df['SHP_DATE_HANDLING_ID']))

    # 6. TIMESTAMP: add timestamps derived from the date columns.
    get_timestamp_udf = udf(self.get_timestamp, IntegerType())
    df = df.withColumn('SHP_DATE_HANDLING_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_HANDLING_ID']))
    df = df.withColumn('SHP_DATE_CREATED_TIMESTAMP', get_timestamp_udf(df['SHP_DATE_CREATED_ID']))

    # HT: handling time computed from the shipped/handling datetime pair.
    my_handling_time_udf = udf(self.my_handling_time, IntegerType())
    df = df.withColumn('HT', my_handling_time_udf(array('SHP_DATETIME_SHIPPED_ID', 'SHP_DATETIME_HANDLING_ID')))

    # Index the categorical string columns into numeric indices.
    shp_sender_indexer = StringIndexer(inputCol="SHP_SENDER_ID", outputCol="SHP_SENDER_ID_NUM").fit(df)
    df = shp_sender_indexer.transform(df)
    shp_sender_indexer = StringIndexer(inputCol="CAT_CATEG_ID_L7", outputCol="CAT_CATEG_ID_L7_NUM").fit(df)
    df = shp_sender_indexer.transform(df)

    # create the vector assembler
    vec_assembler = VectorAssembler(inputCols=['SHP_DATE_HANDLING_TIMESTAMP',
                                               'SHP_DATE_CREATED_TIMESTAMP', 'SHP_SENDER_ID_NUM',
                                               'CAT_CATEG_ID_L7_NUM', 'SHP_ORDER_COST_INT',
                                               'SHP_DAY', 'WKND_DAY', 'WK_NUM', 'MONTH_NUM',
                                               'SHP_ADD_ZIP_CODE'],
                                    outputCol='features')
    # transform the values
    features_df = vec_assembler.transform(df)

    scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MaxAbsScalerModel
    scalerModel = scaler.fit(features_df)

    # rescale each feature to range [-1, 1].
    scaledData = scalerModel.transform(features_df)

    # Save dataset as parquet
    scaledData.write.format("parquet").mode('overwrite').option("header", "true").save(data_output)
# Split the joined frame into one DataFrame per company.
dfAmazon = dfJoin.select('*').where(dfJoin.company == 'AMAZON')
dfGoogle = dfJoin.select('*').where(dfJoin.company == 'GOOGLE')
dfNetflix = dfJoin.select('*').where(dfJoin.company == 'NETFLIX')
dfSnapchat = dfJoin.select('*').where(dfJoin.company == 'SNAPCHAT')
dfMicrosoft = dfJoin.select('*').where(dfJoin.company == 'MICROSOFT')

# NOTE(review): dfFacebook is built in an earlier cell; the transposed
# describe() result below is not assigned to anything — confirm it is
# intentionally display-only.
dfFacebook.describe().toPandas().transpose()
display(dfFacebook)

# COMMAND ----------

#Feature scaling using MaxAbsScaler
vectorAssembler = VectorAssembler(
    inputCols=['avg-sentiment', 'avg-followers', 'avg-volume'],
    outputCol='features')
v_dffacebook = vectorAssembler.transform(dfFacebook)

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(v_dffacebook)
scaledData = scalerModel.transform(v_dffacebook)
scaledData.select("features", "scaledFeatures").show()

v_dffacebook1 = scaledData.select(['features', 'scaledFeatures', 'avg-close'])
v_dffacebook1.show()

# COMMAND ----------

#Train test split
train_df, test_df = v_dffacebook1.randomSplit([0.8, 0.2])

# COMMAND ----------

#Linear Regression model
# NOTE(review): featuresCol='features' trains on the *unscaled* vector even
# though scaledFeatures was just computed — confirm this is intentional.
lr = LinearRegression(featuresCol='features', labelCol='avg-close', maxIter=10)
###MaxAbsScaler (-1, 1) from pyspark.ml.feature import MaxAbsScaler from pyspark.ml.linalg import Vectors dataFrame = spark.createDataFrame([( 0, Vectors.dense([1.0, 0.1, -8.0]), ), ( 1, Vectors.dense([2.0, 1.0, -4.0]), ), ( 2, Vectors.dense([4.0, 10.0, 8.0]), )], ["id", "features"]) scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures") # Compute summary statistics and generate MaxAbsScalerModel scalerModel = scaler.fit(dataFrame) # rescale each feature to range [-1, 1]. scaledData = scalerModel.transform(dataFrame) scaledData.select("features", "scaledFeatures").show() # COMMAND ---------- #####Bucketizer transform the continuous features into columns of feature bucket, by defining the size of the bucket from pyspark.ml.feature import Bucketizer splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
def scaler(input_features):
    """Scale the 'raw_features' vector column into [-1, 1] with MaxAbsScaler.

    Returns the transformed frame with the scaled vector in a 'features'
    column and the original 'raw_features' column dropped.
    """
    model = MaxAbsScaler(inputCol="raw_features",
                         outputCol="features").fit(input_features)
    return model.transform(input_features).drop("raw_features")
@author: luogan
"""

from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

# Single shared SparkSession for this example script.
spark = SparkSession\
    .builder \
    .appName("dataFrame") \
    .getOrCreate()

# Three dense example rows; the per-column max |value| is (4.0, 10.0, 8.0).
dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -8.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, -4.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 8.0]),
)], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()
# Normalize the aggregated column names produced by the upstream groupBy.
dfJoin = dfJoin.withColumnRenamed("avg(volume)","avg-volume")
dfJoin = dfJoin.withColumnRenamed("avg(followers_count)","avg-followers")
dfJoin.show()

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

# Assemble the three aggregate columns into a single 'features' vector.
dfJoin1 = dfJoin.select("avg-sentiment","avg-followers","avg-volume")
inputFeatures = ["avg-sentiment","avg-followers","avg-volume"]
assembler = VectorAssembler(inputCols=inputFeatures, outputCol="features")
dfJoin2 = assembler.transform(dfJoin1)

# COMMAND ----------

# Scaling features
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dfJoin2)
scaledData = scalerModel.transform(dfJoin2)
scaledData.select("features", "scaledFeatures").show()

# COMMAND ----------

#Elbow method
import numpy as np
cost = np.zeros(10)
# NOTE(review): cost[0] and cost[1] stay 0, and k=10 is never evaluated
# because range(2,10) excludes the upper bound — confirm intended.
for k in range(2,10):
    kmeans = KMeans().setK(k).setFeaturesCol("scaledFeatures").setPredictionCol("prediction").setMaxIter(1).setSeed(1)
    model = kmeans.fit(scaledData)
    # NOTE(review): KMeansModel.computeCost is deprecated in newer Spark
    # (ClusteringEvaluator is the replacement) — verify the Spark version in use.
    cost[k] = model.computeCost(scaledData)

# COMMAND ----------