# Feature scaling for numeric features.
#
# Every column of mlSourceDFCat whose name contains 'Lag' is assembled into a
# single vector column, scaled with a fitted StandardScaler (withStd=True,
# withMean=False), split back into one column per feature named
# 'scaled<original column name>', and joined to the remaining columns of
# mlSourceDFCat on 'key'. The fitted scaler is saved to featureScaleModelFile
# and the joined result is exposed as newDF.
featuresForScale = [x for x in mlSourceDFCat.columns if 'Lag' in x]
print(len(featuresForScale))

assembler = VectorAssembler(
    inputCols=featuresForScale,
    outputCol="features",
)
assembled = assembler.transform(mlSourceDFCat).select(col('key'), col('features'))

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
).fit(assembled)
# Persist the fitted model, overwriting anything already stored at that path.
scaler.write().overwrite().save(featureScaleModelFile)

scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')


def extract(row):
    """Flatten a row into a tuple: the key followed by each scaled value.

    Args:
        row: an object exposing ``key`` and ``scaledFeatures``; the values are
            read from ``row.scaledFeatures.values`` and converted to ``float``.

    Returns:
        ``(row.key, v0, v1, ...)`` with one float per entry of
        ``row.scaledFeatures.values``.
    """
    return (row.key,) + tuple(float(x) for x in row.scaledFeatures.values)


# Rebuild each row with a DenseVector before flattening: on a SparseVector,
# `.values` holds only the explicitly stored entries, so reading it directly
# in extract() could yield fewer values than there are features.
rdd = scaledData.rdd.map(
    lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray()))
)
# Only the first column is named here; all columns are renamed just below.
scaledDf = rdd.map(extract).toDF(["key"])

# Rename columns: the key becomes 'scaledKey' and every feature column gets
# the original feature name prefixed with 'scaled'.
oldColumns = scaledDf.columns
scaledColumns = ['scaledKey']
scaledColumns.extend(['scaled' + str(i) for i in featuresForScale])
scaledOutcome = scaledDf.select(
    [col(old).alias(new) for old, new in zip(oldColumns, scaledColumns)]
)

# Columns that were not scaled are taken unchanged from the source frame.
noScaledMLSourceDF = mlSourceDFCat.select(
    [column for column in mlSourceDFCat.columns if column not in featuresForScale]
)

# The key column was renamed to 'scaledKey' above, so the two sides of the
# join condition refer to differently named columns.
newDF = noScaledMLSourceDF.join(
    scaledOutcome,
    noScaledMLSourceDF.key == scaledOutcome.scaledKey,
    'outer',
)
newDF.cache()