# real example
users_noscaled = users_addedmonths

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

# call the vector assembler
assembler = VectorAssembler(inputCols=users_noscaled.columns[7:],
                            outputCol='assembled_col')
# call the scaler
scaler = MinMaxScaler(inputCol="assembled_col",
                      outputCol="assembled_col_norm")
# build an assembled vector column in the dataframe
assembled = assembler.transform(users_noscaled)
# fit the scaler model
scaler_model = scaler.fit(assembled)
# apply the model to the transformed dataframe
users_wscaled = scaler_model.transform(assembled)
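# Optional follow-up sketch (assumes Spark >= 3.0 and the users_wscaled dataframe built
# above): unpack the scaled vector back into one column per original feature so the
# result is easier to inspect. The "_norm" suffix is illustrative.
from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F

scaled_cols = users_noscaled.columns[7:]
users_flat = users_wscaled.withColumn("norm_arr", vector_to_array("assembled_col_norm")).select(
    *users_wscaled.columns,
    *[F.col("norm_arr")[i].alias(c + "_norm") for i, c in enumerate(scaled_cols)]
)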
def scaleVecCol(self, columns, nameOutputCol):
    """
    This function assembles the specified columns into a single vector column
    and then applies Spark's default MinMax scaling (see the example below).

    +---------+----------+
    |Price    |AreaLiving|
    +---------+----------+
    |1261706.9|16        |
    |1263607.9|16        |
    |1109960.0|19        |
    |978277.0 |19        |
    |885000.0 |19        |
    +---------+----------+
             |
             |
             V
    +----------------------------------------+
    |['Price', 'AreaLiving']                 |
    +----------------------------------------+
    |[0.1673858972637624,0.5]                |
    |[0.08966137157852398,0.3611111111111111]|
    |[0.11587093205757598,0.3888888888888889]|
    |[0.1139820728616421,0.3888888888888889] |
    |[0.12260126542983639,0.4722222222222222]|
    +----------------------------------------+
    only showing top 5 rows
    """
    # Check that the columns argument is a string or list datatype:
    self.__assertTypeStrOrList(columns, "columns")
    # Check that the columns to be processed are in the dataframe:
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)
    # Check that the nameOutputCol argument is a string datatype:
    self.__assertTypeStr(nameOutputCol, "nameOutputCol")

    # Model to use vectorAssembler:
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features_assembler")
    # Model for scaling the assembled feature column:
    mmScaler = MinMaxScaler(inputCol="features_assembler", outputCol=nameOutputCol)
    # Dataframe with the features_assembler column:
    tempDF = vecAssembler.transform(self.__df)
    # Fit the scaler model on the transformed dataframe:
    model = mmScaler.fit(tempDF)

    exprs = list(filter(lambda x: x not in columns, self.__df.columns))
    exprs.extend([nameOutputCol])

    self.__df = model.transform(tempDF).select(*exprs)
    self.__addTransformation()  # checkpoint in case

    return self
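# Hedged usage sketch for scaleVecCol above. It assumes the method lives on a
# transformer-style wrapper around a Spark DataFrame; the wrapper class name,
# the instance and the column names below are illustrative, not from the snippet.
transformer = DataFrameTransformer(house_df)  # hypothetical wrapper holding the DataFrame
transformer.scaleVecCol(["Price", "AreaLiving"], "price_area_scaled")  # returns self, so calls can be chained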
### columns
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

sparkDF.columns

# UDF for converting column type from vector to double type
unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())

assembler = VectorAssembler(inputCols=['_c78'], outputCol="hell_Vect")
newDf = assembler.transform(sparkDF)
scaler = MinMaxScaler(inputCol="hell_Vect", outputCol="_Scaled")
scalerModel = scaler.fit(newDf)
# rescale each feature to range [min, max].
scaledData = scalerModel.transform(newDf)

def normaliseEntireDf(sparkDf):
    origColumns = sparkDf.columns
    for i in origColumns:
        # VectorAssembler Transformation - Converting column to vector type
        assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")
        # MinMaxScaler Transformation
        scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")
        # Completion sketch (not in the original snippet): run assemble + scale as a
        # pipeline, flatten the scaled vector back to a double, and drop the helper column.
        pipeline = Pipeline(stages=[assembler, scaler])
        sparkDf = (pipeline.fit(sparkDf).transform(sparkDf)
                   .withColumn(i + "_Scaled", unlist(i + "_Scaled"))
                   .drop(i + "_Vect"))
    return sparkDf
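# Hedged usage sketch for normaliseEntireDf (assumes every column of sparkDF is numeric,
# since each one is assembled into a single-element vector before scaling).
sparkDF_scaled = normaliseEntireDf(sparkDF)
sparkDF_scaled.select([c for c in sparkDF_scaled.columns if c.endswith("_Scaled")]).show(5)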
from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

# COMMAND ----------

from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
def train_scaler(df, inputCol, outputCol):
    scaler = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
    return scaler.fit(df)
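# Hedged usage sketch for train_scaler (train_df/test_df and column names are illustrative;
# "features" must already be a vector column, e.g. built with a VectorAssembler).
scaler_model = train_scaler(train_df, inputCol="features", outputCol="features_scaled")
scaled_train = scaler_model.transform(train_df)
scaled_test = scaler_model.transform(test_df)  # reuse the same fitted model on held-out data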
dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -8.0]),),
    (1, Vectors.dense([2.0, 1.0, -4.0]),),
    (2, Vectors.dense([4.0, 10.0, 8.0]),)
], ["id", "features"])
dataFrame.show()

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
# Compute summary statistics and generate a MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)
# rescale each feature to the range [min, max].
scaledData = scalerModel.transform(dataFrame)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show(10, False)
# $example off$

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
# Compute summary statistics and generate a MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)
# rescale each feature to the range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
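# Worked check of the two formulas on the dataFrame above (hand-computed, not Spark output).
# MinMaxScaler (defaults min=0, max=1): x' = (x - col_min) / (col_max - col_min)
#   first components   1.0,  2.0,  4.0 -> (x - 1.0) / 3.0  -> 0.0, 0.3333..., 1.0
#   second components  0.1,  1.0, 10.0 -> (x - 0.1) / 9.9  -> 0.0, 0.0909..., 1.0
#   third components  -8.0, -4.0,  8.0 -> (x + 8.0) / 16.0 -> 0.0, 0.25,      1.0
# MaxAbsScaler: x' = x / max(|x|), which keeps the sign and maps into [-1, 1]
#   third components  -8.0, -4.0,  8.0 -> x / 8.0          -> -1.0, -0.5, 1.0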
'''
#####################
## NEURAL NETWORKS ##
#####################

########################
## RESCALING DATA SET ##
########################

# Typically for neural networks to perform better,
# a lot of preprocessing has to go into the data,
# so I scaled the feature space to have min = 0 and max = 1
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(df)
scaledData = scalerModel.transform(df)

print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

new_df = scaledData.selectExpr("label", "radius_mean", "texture_mean", "perimeter_mean",
                               "area_mean", "smoothness_mean", "compactness_mean",
                               "concavity_mean", "concave_points_mean", "symmetry_mean",
                               "fractal_dimension_mean", "radius_se", "texture_se",
                               "perimeter_se", "area_se", "smoothness_se", "compactness_se",
                               "concavity_se", "concave_points_se", "symmetry_se",
                               "fractal_dimension_se", "radius_worst", "texture_worst",
                               "perimeter_worst", "area_worst", "smoothness_worst",
                               "compactness_worst",
def MLClassifierDFPrep(df, input_columns, dependent_var,
                       treat_outliers=True, treat_neg_values=True):

    # Change the label (class variable) to string type to prep for reindexing.
    # PySpark expects a zero-indexed integer for the label column; in case our data
    # is not in that format, we treat it with the built-in StringIndexer.
    renamed = df.withColumn("label_str", df[dependent_var].cast(StringType()))  # rename and cast to string
    indexer = StringIndexer(inputCol="label_str", outputCol="label")  # PySpark expects this naming convention
    indexed = indexer.fit(renamed).transform(renamed)
    print(indexed.groupBy("class", "label").count().show(100))

    # Convert all string-typed data in the input column list to numeric,
    # otherwise the algorithm will not be able to process it.
    numeric_inputs = []
    string_inputs = []
    for column in input_columns:
        if str(indexed.schema[column].dataType) == 'StringType':
            indexer = StringIndexer(inputCol=column, outputCol=column + "_num")
            indexed = indexer.fit(indexed).transform(indexed)
            new_col_name = column + "_num"
            string_inputs.append(new_col_name)
        else:
            numeric_inputs.append(column)

    if treat_outliers == True:
        print("We are correcting for non normality now!")
        # Create a dictionary of 1st/99th percentile quantiles per column
        d = {}
        for col in numeric_inputs:
            d[col] = indexed.approxQuantile(col, [0.01, 0.99], 0.25)  # increase the last number (relativeError) to make it go faster
        # Now fill in the values
        for col in numeric_inputs:
            skew = indexed.agg(skewness(indexed[col])).collect()  # check for skewness
            skew = skew[0][0]
            # This will floor, cap and then log+1 (just in case there are 0 values)
            if skew > 1:
                indexed = indexed.withColumn(col,
                    log(when(df[col] < d[col][0], d[col][0])
                        .when(indexed[col] > d[col][1], d[col][1])
                        .otherwise(indexed[col]) + 1).alias(col))
                print(col + " has been treated for positive (right) skewness. (skew = ", skew, ")")
            elif skew < -1:
                indexed = indexed.withColumn(col,
                    exp(when(df[col] < d[col][0], d[col][0])
                        .when(indexed[col] > d[col][1], d[col][1])
                        .otherwise(indexed[col])).alias(col))
                print(col + " has been treated for negative (left) skewness. (skew = ", skew, ")")

    # Produce a warning if there are negative values in the dataframe, because Naive Bayes
    # cannot be used on negative features. We only need to check the numeric input values,
    # since anything that is indexed won't have negative values.
    minimums = df.select([min(c).alias(c) for c in df.columns if c in numeric_inputs])  # mins for all input columns
    min_array = minimums.select(array(numeric_inputs).alias("mins"))  # collect the per-column mins into one array
    df_minimum = min_array.select(array_min(min_array.mins)).collect()  # collect the global min as a Python object
    df_minimum = df_minimum[0][0]  # slice to get the number itself

    features_list = numeric_inputs + string_inputs
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    output = assembler.transform(indexed).select('features', 'label')
    # final_data = output.select('features','label')  # drop everything else

    # Now check for negative values and ask the user whether to correct that.
    if df_minimum < 0:
        print(" ")
        print("WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contains negative values")
        print(" ")

    if treat_neg_values == True:
        print("You have opted to correct that by rescaling all your features to a range of 0 to 1")
        print(" ")
        print("We are rescaling your dataframe....")
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
        # Compute summary statistics and generate a MinMaxScalerModel
        scalerModel = scaler.fit(output)
        # rescale each feature to the range [0, 1]
        scaled_data = scalerModel.transform(output)
        final_data = scaled_data.select('label', 'scaledFeatures')  # keep the label in the selection
        final_data = final_data.withColumnRenamed('scaledFeatures', 'features')
        print("Done!")
    else:
        print("You have opted not to correct that, therefore you will not be able to use the Naive Bayes classifier")
        print("We will return the dataframe unscaled.")
        final_data = output

    return final_data
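# Hedged usage sketch for MLClassifierDFPrep (raw_df and the column names below are
# illustrative; the function itself expects the dependent variable to be named "class").
input_cols = [c for c in raw_df.columns if c != "class"]
final_data = MLClassifierDFPrep(raw_df, input_cols, "class",
                                treat_outliers=True, treat_neg_values=True)
final_data.show(5, False)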
input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
df_input = spark.createDataFrame(input_data, ["label", "features"])
df_input

# COMMAND ----------

# MAGIC %md ## split data into training and test data

# COMMAND ----------

from pyspark.ml.feature import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

# Fit the scaler to the DataFrame
scaler = scaler.fit(df_input)

# COMMAND ----------

# Transform the data in `df_input` with the fitted scaler
scaled_df = scaler.transform(df_input)
scaled_df.first()

# COMMAND ----------

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=7)

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

# Initialize `lr`
lr = LogisticRegression(labelCol="label",
data.printSchema()
data.head()

closeDF = data.select("close")

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["close"], outputCol="features")
closeAss = assembler.transform(closeDF)
closeAss.show(40)

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler
MinMaxScalerizer = MinMaxScaler().setMin(0).setMax(100).setInputCol("features").setOutputCol("MinMax_Scaled_features")
input_data = MinMaxScalerizer.fit(closeAss).transform(closeAss).select("MinMax_Scaled_features").collect()
l = len(input_data)

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch_Size, window_Size, hidden_layer, learning_rate, epochs, clip_margin = 50, 50, 256, 0.001, 200, 4
inputs = tf.placeholder(tf.float32, [batch_Size, window_Size, 1])
targets = tf.placeholder(tf.float32, [batch_Size, 1])

def create_input():
    X = []
    Y = []
    i = 0
    while (i + window_Size) <= len(input_data) - 1:
spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json")
    .select("review_count", "average_stars", "yelping_since").rdd
    .map(lambda x: (x[0], x[1], (today - par.parse(x[2])).days)).collect()[:1700])

scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")

# Getting the input data
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(lambda x: Vectors.dense(x))

# Initialize GMM
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)

df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1
print df

df_with = spark.createDataFrame(
# Filtering
emp_mgr_df = emp_df.filter("salary >= 100000")
# print(emp_mgr_df.count())

# Choosing one column
print(emp_mgr_df.select("salary").show())

# Data transformations

# Normalization
from pyspark.ml.feature import MinMaxScaler
feature_scaler = MinMaxScaler(inputCol="features", outputCol="normalized_features")
normalized_model = feature_scaler.fit(dataset=features_df)
normalized_features_df = normalized_model.transform(features_df)
print(normalized_features_df.take(1))

# Standardization
from pyspark.ml.feature import StandardScaler
feature_scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                                withStd=True, withMean=True)
std_model = feature_scaler.fit(features_df)
scaled_feature_df = std_model.transform(features_df)
print(scaled_feature_df.take(1))

# Bucketing
def scale(df, feature_name):
    scaler = MinMaxScaler(inputCol="FeatureVector_unscaled_" + feature_name,
                          outputCol="FeatureVector_" + feature_name)
    df = scaler.fit(df).transform(df)
    return df
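# Hedged usage sketch for scale(): it assumes the dataframe already has a vector column
# named "FeatureVector_unscaled_<feature_name>", e.g. built with a VectorAssembler.
# The people_df dataframe and the "age" feature below are illustrative.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["age"], outputCol="FeatureVector_unscaled_age")
df_vec = assembler.transform(people_df)
df_scaled = scale(df_vec, "age")
df_scaled.select("age", "FeatureVector_age").show(5)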
print('\nHere are the first 40 instances:\n\n')
print(data.show(40))

####################################################################################
## part 2
print('*' * 100)
print('Part 2 - Normalize features between 0 and 1\n')

# assemble feature values into a vector and create a column containing those vectors
assembler = VectorAssembler().setInputCols(data.columns[1:]).setOutputCol('features')
transformed = assembler.transform(data)

# create scaler object, transform feature vectors and add a scaledFeatures column
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(transformed.select('features'))
scaledData = scalerModel.transform(transformed)
print('Features scaled to range: {} to {}'.format(scaler.getMin(), scaler.getMax()))
# print(scaledData.select('_c0','features','scaledFeatures').show(10))

# limit dataset to label and scaled vectors
scaledData = scaledData.select('_c0', 'scaledFeatures')
# rename columns
scaledData = scaledData.withColumnRenamed('_c0', 'label').withColumnRenamed('scaledFeatures', 'features')
print(scaledData.show(5))

####################################################################################
# ### Scaling

# In[60]:

# Applying Min-Max scaling
from pyspark.ml.feature import MinMaxScaler
mm_scaler = MinMaxScaler(inputCol="features", outputCol="minmax_scaled_features")

# In[61]:

mm = mm_scaler.fit(df_vect)
df_scale = mm.transform(df_vect)
df_scale.select("minmax_scaled_features", "success_failure").limit(5).toPandas()

# ### Dividing the dataset

# In[62]:

df_train, df_test = df_scale.randomSplit(weights=[0.7, 0.3], seed=1)
print("Number of observations in train -", df_train.count())
print("Number of observations in test -", df_test.count())
# df_train.count(), df_test.count()
row["sflow_bpackets"], row["sflow_bbytes"], row["fpsh_cnt"], row["bpsh_cnt"], row["furg_cnt"], row["burg_cnt"], row["total_fhlen"], row["total_bhlen"], row["dscp"] ])) return obj fluxoRDD4 = fluxoDF.rdd.map(transformaVar) fluxoDF = spSession.createDataFrame(fluxoRDD4, ["rotulo", "atributos"]) scaler = MinMaxScaler(inputCol="atributos", outputCol="scaledFeatures", min=0.0, max=1.0) scalerModel = scaler.fit(fluxoDF) scaledData = scalerModel.transform(fluxoDF) # Indexação é pré-requisito para Decision Trees stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed") si_model = stringIndexer.fit(scaledData) obj_final = si_model.transform(scaledData) X = np.array(obj_final.select("scaledFeatures").collect()) y = np.array(obj_final.select("indexed").collect()) #mudar a dimensão da matriz de atributos para 2d nsamples, nx, ny = X.shape d2_X = X.reshape((nsamples, nx * ny)) # Criando o modelo
print("# Correlation matrix:") print("###########################################") print(df_covid_count.corr()) ########################################### # Prepare spark model: ########################################### df_clean = spark.createDataFrame(df_covid_count) assembler = VectorAssembler().setInputCols( ["COVID_COUNT", "FAVS_PER_TWEET", "RT_PER_TWEET", "TWEETS_PER_HOUR"]).setOutputCol("IND_VARS") df_clean_assmbl = assembler.transform(df_clean) scaler = MinMaxScaler(inputCol="IND_VARS", outputCol="SCALED_IND_VARS") scaler_model = scaler.fit(df_clean_assmbl.select("IND_VARS")) scaled_data = scaler_model.transform(df_clean_assmbl) #scaled_data.show(3) # split data splits = scaled_data.randomSplit([0.7, 0.3], 1) df_train = splits[0] df_test = splits[1] # LR model: lr = LinearRegression(featuresCol="SCALED_IND_VARS", labelCol="MEAN_SENT_POLARITY", maxIter=10, regParam=0.3, elasticNetParam=0.8) lr_model = lr.fit(df_train)
nb_classify(training, testing, training.schema.names[0], training.schema.names[1])
rf_classify(training, testing, training.schema.names[0], training.schema.names[1])
knn_classify(training, testing, training.schema.names[0], training.schema.names[1])
end_time = datetime.datetime.now()
time_take = int((end_time - start_time).total_seconds())
print("time taken: ", time_take, " seconds")
print()

# training on PCA features
print("------ result of PCA features ------")
scaler = MinMaxScaler(inputCol=training.schema.names[2], outputCol="scaledPCAFeatures")
scalerModel = scaler.fit(training)
training = scalerModel.transform(training)
testing = scalerModel.transform(testing)

# Naive Bayes can't handle negative input features, so it gets the MinMax-scaled PCA features here instead of the raw ones
start_time = datetime.datetime.now()
nb_classify(training, testing, training.schema.names[0], "scaledPCAFeatures")
rf_classify(training, testing, training.schema.names[0], training.schema.names[2])
knn_classify(training, testing, training.schema.names[0], training.schema.names[2])
end_time = datetime.datetime.now()
time_take = int((end_time - start_time).total_seconds())
print("time taken: ", time_take, " seconds")
rfm_seg = rfm_seg.withColumn("m_seg", M_udf("Monetary")) rfm_seg.show(5) rfm_seg = rfm_seg.withColumn('RFMScore', F.concat(F.col('r_seg'), F.col('f_seg'), F.col('m_seg'))) rfm_seg.sort(F.col('RFMScore')).show(5) # statistical summary simple_summary = rfm_seg.groupby('RFMScore').agg({"Recency": "mean", "Frequency": "mean", "Monetary": "mean"}).sort( F.col('RFMScore')) # Extension: apply k-means clustering section to do the segmentation from pyspark.ml.linalg import Vectors def transData(df): return df.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['CustomerID', 'rfm']) transformed_df = transData(rfm) # scale the feature matrix from pyspark.ml.feature import MinMaxScaler scaler = MinMaxScaler(inputCol='rfm', outputCol="features") scalerModel = scaler.fit(transformed_df) scaledData = scalerModel.transform(transformed_df) scaledData.show(5, False) # K-means clustering
print 'labeled_data : \n', labeled_data.take(10)

# Part 3
'''
Choose two features and generate a grey-scale heat map for each feature that shows the
variation of each feature across 40 sample instances.
Normalize features between 0 and 1, with 1 representing the darkest shade in the heat map.
Experiment with MinMaxScaler: https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler
'''
lines = data.map(lambda line: line.split(','))
data_transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
data_labeled_df = sqlContext.createDataFrame(data_transformed, ["label", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(data_labeled_df.limit(40))
scaled_data = scaler_model.transform(data_labeled_df)

print 'Labeled DF : \n', data_labeled_df.show(4)
scaled_data.select("features", "scaled_features").show(4)
scaled_data.show(1, False)

# Select any two features and plot heat map
heatmap1 = np.asarray(
    data_labeled_df.rdd.map(
        lambda r: (float(r.features[1]), float(r.features[1]))).take(40))
plt.imshow(heatmap1, cmap='gray')
plt.show()

heatmap2 = np.asarray(
    scaled_data.rdd.map(lambda r:
timesSvd = []
vocabSizes = []

# change the min count of words in order to change the vocabulary size
for minCnt in range(3, 19, 3):

    ########## WORD2VEC ###########
    word2Vec = Word2Vec(vectorSize=vecSize, minCount=minCnt, windowSize=10,
                        inputCol="hashtags", outputCol="result")
    trainDf = dfW2v.select("hashtags")
    modelW2v = word2Vec.fit(trainDf)
    resultW2v = modelW2v.transform(dfW2v)
    vocabularySize = modelW2v.getVectors().count()
    print("\n" + str(count) + ".vector size " + str(vecSize) + ", minCount " +
          str(minCnt) + ", vocabulary_size " + str(vocabularySize))

    ###### MINMAXSCALE #########
    print("scaling the data.....")
    data = resultW2v.withColumnRenamed("result", "w2vVector")
    scaler = MinMaxScaler(inputCol="w2vVector", outputCol="scaledFeatures")
    scalerModel = scaler.fit(data)
    scaledData = scalerModel.transform(data)

    tokens = []
    for user in scaledData.select("screen_name", "scaledFeatures").collect():
        tokens.append(user[1])

    # PCA
    print("running PCA from Sklearn....")
    start = time.time()
    pcaModel = sklearnPCA(n_components=2)
    pcaValues = pcaModel.fit_transform(tokens)
    end = time.time()
    timePcaSklearn = end - start
    print("ended PCA from Sklearn....")
# _*_ coding:utf-8 _*_
'''
MinMaxScaler
'''
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler

spark = SparkSession.builder.appName("MinMaxScaler").getOrCreate()

paths = "/export/home/ry/spark-2.2.1-bin-hadoop2.7/data/mllib/"
dataframe = spark.read.format("libsvm").load(
    paths + "sample_isotonic_regression_libsvm_data.txt")

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(dataframe)
scaledData = scalerModel.transform(dataframe)
scaledData.show()
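# Optional follow-up sketch (the path below is illustrative): a fitted MinMaxScalerModel
# can be persisted and reloaded like other Spark ML models.
from pyspark.ml.feature import MinMaxScalerModel

scalerModel.write().overwrite().save("/tmp/minmax_scaler_model")
reloaded = MinMaxScalerModel.load("/tmp/minmax_scaler_model")
reloaded.transform(dataframe).show(3)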
outputCol="features") assembled_train = assembler.transform(train_data) assembled_train.select("features", "Class").show(truncate=False) training_set = assembled_train.select("features", "Class") #Split de los datos train_final, test_final = training_set.randomSplit([0.80, 0.20], seed = 13) train_final.describe().show() test_final.describe().show() train_final = train_final.selectExpr("Class as label", "features as features") test_final = test_final.selectExpr("Class as label", "features as features") scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures") scalerModel = scaler.fit(train_final) scaledTData = scalerModel.transform(train_final) scaledTData = scaledTData.select("label", "scaledFeatures") scaledTData = scaledTData.selectExpr("label as label", "scaledFeatures as features") scalerModel = scaler.fit(test_final) scaledFData = scalerModel.transform(test_final) scaledFData = scaledFData.select("label", "scaledFeatures") scaledFData = scaledFData.selectExpr("label as label", "scaledFeatures as features") #Clasificador 2 nb = NaiveBayes(smoothing=1.3, modelType="multinomial") # train the model model = nb.fit(scaledTData)
selectedCols = ['label', 'features'] + final_data.columns
df = df.select(selectedCols)
# df.printSchema()

# ## Random Forest Classification
from pyspark.ml.classification import RandomForestClassifier

### MinMax Scaling
from pyspark.ml.feature import MinMaxScaler
scaler = MinMaxScaler(inputCol='features', outputCol='scaledfeatures')

start_time = time.time()
scalermodel = scaler.fit(df)
scalerdata = scalermodel.transform(df)
end_time = time.time()
print("total time taken for Scaling loop in seconds: ", end_time - start_time)

train, test = scalerdata.randomSplit([0.8, 0.2])

start_time = time.time()
rf = RandomForestClassifier(featuresCol="scaledfeatures", labelCol="label",
                            predictionCol="prediction", probabilityCol="probability",
                            rawPredictionCol="rawPrediction")
rfModel = rf.fit(train)
end_time = time.time()
print("total time taken to run rf in seconds: ", end_time - start_time)
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

features_df = spark.createDataFrame([
    (1, Vectors.dense([10.0, 10000.0, 1.0]),),
    (2, Vectors.dense([20.0, 30000.0, 2.0]),),
    (3, Vectors.dense([30.0, 40000.0, 3.0]),)
], ['id', 'features'])
features_df.take(1)

feature_scaler = MinMaxScaler(inputCol='features', outputCol='sfeatures')
smodel = feature_scaler.fit(features_df)
sfeatures_df = smodel.transform(features_df)
sfeatures_df.take(1)
sfeatures_df.select("features", "sfeatures").show()

# 2.3 Standardize numeric data

# In[31]:

from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

features_df = spark.createDataFrame([
    (1, Vectors.dense([10.0, 10000.00, 1.0]),),
    (
df_features
# DataFrame[id:bigint, features:vector]

df_features.printSchema()
# root
#  |-- id: long (nullable = true)
#  |-- features: vector (nullable = true)

df_features.count()
# 3

df_features.show()
# +---+------------------+
# | id|          features|
# +---+------------------+
# |  1|[10.0,10000.0,1.0]|
# |  2|[20.0,30000.0,2.0]|
# |  3|[30.0,40000.0,3.0]|
# +---+------------------+

df_features.take(1)
# [Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]))]

df_features.take(2)
# [Row(id=1, features=DenseVector([10.0, 10000.0, 1.0])), Row(id=2, features=DenseVector([20.0, 30000.0, 2.0]))]

featureScaler = MinMaxScaler(inputCol="features", outputCol="sfeatures")
smodel = featureScaler.fit(df_features)
dfSfeatures = smodel.transform(df_features)  # transform step, missing in the original, needed before showing
dfSfeatures.show(10, False)
# +---+------------------+----------------------------+
# |id |features          |sfeatures                   |
# +---+------------------+----------------------------+
# |1  |[10.0,10000.0,1.0]|[0.0,0.0,0.0]               |
# |2  |[20.0,30000.0,2.0]|[0.5,0.6666666666666666,0.5]|
# |3  |[30.0,40000.0,3.0]|[1.0,1.0,1.0]               |
# +---+------------------+----------------------------+
labeled_data = msd.map(transform_to_labeled_point)
print 'labeled_data : \n', labeled_data.take(5)

# Part 3
# Choose two features and generate a grey-scale heat map for each feature that shows the
# variation of each feature across 40 sample instances.
# Normalize features between 0 and 1, with 1 representing the darkest shade in the heat map.
# Experiment with MinMaxScaler: https://spark.apache.org/docs/latest/ml-features.html#minmaxscaler

lines = msd.map(lambda line: line.split(','))
msd_transformed = lines.map(lambda line: (line[0], Vectors.dense(line[1:])))
msd_labeled_df = sqlContext.createDataFrame(msd_transformed, ["label", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
# Compute summary statistics and generate a MinMaxScalerModel for 40 samples
scaler_model = scaler.fit(msd_labeled_df.limit(40))
# rescale each feature to range [min, max].
scaled_data = scaler_model.transform(msd_labeled_df)

print 'msd_labeled_df : \n', msd_labeled_df.show(2)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaled_data.select("features", "scaled_features").show(2)
scaled_data.show(1, False)

# Select any two features and plot heat map
heatmap1 = np.asarray(
    msd_labeled_df.rdd.map(
        lambda r: (float(r.features[1]), float(r.features[1]))).take(40))
plt.imshow(heatmap1, cmap='gray')
plt.show()
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()
    # $example off$

    spark.stop()
    for column in list(dataset.columns)
]
pipeline = Pipeline(stages=indexers)
dataset_r = pipeline.fit(dataset).transform(dataset)

columnList = [item[0] for item in dataset_r.dtypes if item[1].startswith('double')]
dataset_numeric = dataset_r.select(columnList)

vecAssembler = VectorAssembler(inputCols=list(dataset_numeric.columns), outputCol="features")
transformed = vecAssembler.transform(dataset_numeric)

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(transformed.select("features"))
df_kmeans = scalerModel.transform(transformed)
df_kmeans.show()
df_kmeans = df_kmeans.select('scaledFeatures')

cost = np.zeros(10)
for k in range(2, 10):
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
    model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(df_kmeans)

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 10), cost[2:10])
ax.set_xlabel('k')
ax.set_ylabel('cost')
plt.show()