def oneHotEncodeColumns(df, cols):
    from pyspark.ml.feature import OneHotEncoder
    newdf = df
    for c in cols:
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c + "-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-onehot", c)
    return newdf
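# Hedged aside: the helper above uses the pre-3.0 transformer-style API. In
# Spark >= 3.0, OneHotEncoder became an Estimator with multi-column support and
# must be fit first. A minimal sketch of the same helper under that assumption
# (function name is illustrative):
def oneHotEncodeColumnsV3(df, cols):
    from pyspark.ml.feature import OneHotEncoder
    encoder = OneHotEncoder(inputCols=cols,
                            outputCols=[c + "-onehot" for c in cols],
                            dropLast=False)
    newdf = encoder.fit(df).transform(df)
    for c in cols:
        newdf = newdf.drop(c).withColumnRenamed(c + "-onehot", c)
    return newdf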
from pyspark.ml.feature import OneHotEncoder, StringIndexer

def events(df, column_name):
    i = column_name + "I"
    v = column_name + "V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
def oneHotEncoding(self, df, input_col):
    stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
    model = stringInd.fit(df)
    td = model.transform(df)
    encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
    final_encoding = encoder.transform(td).select(df.id, 'features').cache()
    # give the udf an explicit ArrayType(DoubleType()) return type so the dense
    # values come back as numbers rather than a stringified list
    conv_udf = udf(lambda line: Vectors.dense(line).tolist(), ArrayType(DoubleType()))
    final_encoding = final_encoding.select(
        df.id, conv_udf(final_encoding.features).alias("num_" + input_col)).cache()
    return final_encoding
    .getOrCreate()

# load data in csv format with header
rawData = spark.read.load("./hour.csv", format="csv", header=True)
rawData.count()  # 17379
data = rawData
# casual + registered = cnt
rawData = rawData.drop("casual", "registered")  # drop columns
rawData = rawData.withColumnRenamed("cnt", "label")  # rename column
cat_features = rawData.columns[2:10]
for col in cat_features:
    # must give a new column name
    indexer = StringIndexer(inputCol=col, outputCol=col + "_indexed", handleInvalid='error')
    indexed = indexer.fit(rawData).transform(rawData)
    encoder = OneHotEncoder(inputCol=col + "_indexed", outputCol=col + "Vec")
    rawData = encoder.transform(indexed)
# cast columns to float
for col in rawData.columns[2:15]:
    rawData = rawData.withColumn(col, rawData[col].cast(FloatType()))
# convert date to date format and extract week day
from pyspark.sql.functions import date_format
rawData = rawData.withColumn("dteday", rawData["dteday"].cast(DateType()))
rawData = rawData.withColumn('dteday', date_format('dteday', 'u'))
isweekend = udf(lambda x: 1.0 if int(x) > 5 else 0.0, FloatType())
rawData = rawData.withColumn("isWeekend", isweekend("dteday"))  # whether it is a weekend
rawData = rawData.drop("dteday")
c16I = StringIndexer(inputCol="C16", outputCol="iC16", handleInvalid="skip")
c18I = StringIndexer(inputCol="C18", outputCol="iC18", handleInvalid="skip")
c19I = StringIndexer(inputCol="C19", outputCol="iC19", handleInvalid="skip")
c21I = StringIndexer(inputCol="C21", outputCol="iC21", handleInvalid="skip")
appcatI = StringIndexer(inputCol="app_category", outputCol="i_app_category", handleInvalid="skip")
devtypeI = StringIndexer(inputCol="device_type", outputCol="i_device_type", handleInvalid="skip")
sitecatI = StringIndexer(inputCol="site_category", outputCol="i_site_category", handleInvalid="skip")

# OneHotEncoder applied after the StringIndexer to form a binary vector for each column
c1E = OneHotEncoder(inputCol="iC1", outputCol="C1Vector")
c15E = OneHotEncoder(inputCol="iC15", outputCol="C15Vector")
c16E = OneHotEncoder(inputCol="iC16", outputCol="C16Vector")
c18E = OneHotEncoder(inputCol="iC18", outputCol="C18Vector")
c19E = OneHotEncoder(inputCol="iC19", outputCol="C19Vector")
c21E = OneHotEncoder(inputCol="iC21", outputCol="C21Vector")
appcatE = OneHotEncoder(inputCol="i_app_category", outputCol="i_app_category_Vector")
devtypeE = OneHotEncoder(inputCol="i_device_type", outputCol="i_device_type_Vector")
sitecatE = OneHotEncoder(inputCol="i_site_category", outputCol="i_site_category_Vector")

# Vector assembler
fAssembler = VectorAssembler(inputCols=[
    "C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector",
cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
]
data_cols = data.select(cols)
data_cols.show()
final_data = data_cols.na.drop()

# Transform the categorical columns into numbers
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
# A B C
# 0 1 2
# One-hot encode ----> this maps everything into [1, 0, 0], [0, 1, 0], etc.
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')
# ---> each entry will be converted to a vector: A = [1, 0], B = [0, 1]
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')
# ---> each entry will be converted to a vector: A = [1, 0], B = [0, 1]

new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')
logreg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[
    gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler,
data = data.filter(lambda row: row != header)
schema = data.map(lambda x: Row(id=x[0], make=x[1], vdps=x[2], label=x[3]))
df = sqlContext.createDataFrame(schema)

# string indexer for our categorical features
# this indexes each categorical feature and we will
# save them in a data frame that maps the make name to the index
# for persistence purposes
indexer = StringIndexer(inputCol="make", outputCol="makeIDX")
df = indexer.fit(df).transform(df)
# keep the mapping DataFrame; .show() returns None, so don't assign its result
make_idx_mappings = df.select('make', 'makeIDX').distinct()
make_idx_mappings.show()

# one hot encoder
# this will convert the indexed strings to sparse one hot vectors
# think of this as dummy feature creation
encoder = OneHotEncoder(inputCol="makeIDX", outputCol="make_sparse_vect")
df = encoder.transform(df)

# spark models expect to see a feature vector and a prediction column
# so we need to put all our features into a vector, in this case
# the sparse vector and vdp count; we also have to do some
# data type conversions from string to double
df = df.withColumn("vdp_int", df["vdps"].cast("double"))
df = df.withColumn("label_int", df["label"].cast("double"))
assembler = VectorAssembler(inputCols=["make_sparse_vect", "vdp_int"], outputCol='features')
df = assembler.transform(df)

# make the model
# the step size and iterations are touchy, so results might be funky
gbt = GBTRegressor(maxIter=100,
mmi_value_0_node4 = ["Sex", "Embarked", "Survived"]
mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"]
stages_node4 = []
for i in range(len(mmi_value_0_node4)):
    stages_node4.append(
        StringIndexer(inputCol=mmi_value_0_node4[i],
                      outputCol=mmi_value_1_node4[i],
                      handleInvalid="error",
                      stringOrderType="frequencyDesc"))

mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"]
mmi_value_1_node5 = ['sexVec', 'embarkedVec']
stages_node5 = []
for i in range(len(mmi_value_0_node5)):
    stages_node5.append(
        OneHotEncoder(inputCol=mmi_value_0_node5[i],
                      outputCol=mmi_value_1_node5[i]))

pipeline_stage_node6 = VectorAssembler(
    outputCol="features",
    inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"])
pipeline_stage_node7 = RandomForestClassifier(featureSubsetStrategy="auto",
                                              numTrees=20,
                                              maxDepth=5,
                                              predictionCol="prediction",
                                              rawPredictionCol="rawPrediction",
                                              probabilityCol="probability",
                                              labelCol="indexedSurvived",
                                              featuresCol="features",
                                              impurity="gini")
stages_node8 = [
# create a new "carrier_indexed" column
(indexed_df.select(["origin", "dest", "carrier", "carrier_indexed"])
    .sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())

# check the encoded carrier values
carrierIndexer.labels

# check the carrier code and index mapping
indexed_df.select(["carrier", "carrier_indexed"]).distinct().show()

carrierEncoder = OneHotEncoder(inputCol="{0}_indexed".format(colName),
                               outputCol="{0}_encoded".format(colName))
encoded_df = carrierEncoder.transform(indexed_df)
(encoded_df.select(["origin", "dest", "carrier", "carrier_indexed", "carrier_encoded"])
    .sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())

# re-encode, this time keeping the last category (dropLast=False)
carrierEncoder = OneHotEncoder(inputCol="{0}_indexed".format(colName),
                               outputCol="{0}_encoded".format(colName),
                               dropLast=False)
encoded_df = carrierEncoder.transform(indexed_df)
(encoded_df.select(["carrier", "carrier_indexed", "carrier_encoded", "dist"])
    .sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())

# Combine StringIndexer, OneHotEncoder, VectorAssembler and a Transformer to put features into a feature vector column
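# Hedged aside on what dropLast changes, using a three-row toy column
# (assumes an active `spark` session and the Spark 3.x estimator API; the toy
# data and column names are illustrative):
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["idx"])
# default dropLast=True: index 2.0 becomes the all-zero 2-element vector (2,[],[])
OneHotEncoder(inputCols=["idx"], outputCols=["vec"]).fit(toy).transform(toy).show()
# dropLast=False: index 2.0 keeps its own slot, (3,[2],[1.0])
OneHotEncoder(inputCols=["idx"], outputCols=["vec"], dropLast=False).fit(toy).transform(toy).show()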
        for col in ordinals
    ],
    VectorAssembler(inputCols=ordinals_input, outputCol='ordinals_vector'),
    StandardScaler(inputCol='ordinals_vector', outputCol='ordinals_std',
                   withStd=True, withMean=True),
    # categoricals
    *[
        StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid='keep')
        for col in categoricals
    ],
    *[
        OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encode", dropLast=True)
        for col in categoricals
    ],
    VectorAssembler(inputCols=categoricals_input, outputCol='categoricals_vector'),
    StandardScaler(inputCol='categoricals_vector', outputCol='categoricals_std',
                   withStd=True, withMean=True),
    # final assembler
    VectorAssembler(inputCols=stdFeatures, outputCol='features_std'),
    # PCA
    PCA(k=75, inputCol='features_std', outputCol='features_final')
]
total = train.union(val).union(test)

# create features
indexer = StringIndexer(inputCol="_c12", outputCol="c22")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)

# create label
indexer = StringIndexer(inputCol="_c11", outputCol="label")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)

# One-hot encoder
encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
train = encoder.transform(train)
val = encoder.transform(val)
test = encoder.transform(test)

# create the trainer and set its parameters
with open('H1_15300180012_output.txt', 'a') as f:
    f.write('\n \n')
    f.write('jq_H1_15300180012_output_naive_bayes\n')
para = 1.0
with open('H1_15300180012_output.txt', 'a') as f:
    f.write('Smoothing parameter: {} \n'.format(para))
nb = NaiveBayes(smoothing=para, modelType="multinomial", labelCol="label", featuresCol="c2")

# train the model
model = nb.fit(train)
df = spark.read.csv('home_data.csv', header=True)
df = df.withColumn("price", df["price"].cast(DoubleType()))\
       .withColumn("sqft_living", df["sqft_living"].cast(DoubleType()))
print(df.columns)

# Get training sets
(trainData, testData) = df.randomSplit(seed=123, weights=[0.7, 0.3])
print("The total data is {}, the training is {} and the test is {}"\
      .format(df.count(), trainData.count(), testData.count()))

# Train & Evaluate
stringifier = StringIndexer(inputCol="zipcode", outputCol="zipIndex")
oneHotter = OneHotEncoder(inputCol="zipIndex", outputCol="zipVector")
vectorizer = VectorAssembler(inputCols=["sqft_living", "zipVector"], outputCol="features")
glr = GeneralizedLinearRegression(labelCol="price", family="gaussian", link="identity",
                                  maxIter=10, regParam=0.3)
rf = RandomForestRegressor(labelCol="price", seed=1234)
rfAdv = RandomForestRegressor(labelCol="price", seed=1234, numTrees=500, maxDepth=10,
                              maxBins=100, minInstancesPerNode=5, featureSubsetStrategy="all")

for alg in [(glr, "Linear Regression"), (rf, "Random Forest (Default)"),
            (rfAdv, "Random Forest (Advanced)")]:
    print("+++++%s Results+++++" % (alg[1]))
    simplePipeline = Pipeline(stages=[stringifier, oneHotter, vectorizer, alg[0]])
    model = simplePipeline.fit(trainData)
    # Print Results
    #testingData = vectorizer.transform(testData)
    # Make predictions.
    predictions = model.transform(testData)
store = Store.create(args.work_dir)

# Download MNIST dataset
data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
if not os.path.exists(libsvm_path):
    subprocess.check_output(['wget', data_url, '-O', libsvm_path])

# Load dataset into a Spark DataFrame
df = spark.read.format('libsvm') \
    .option('numFeatures', '784') \
    .load(libsvm_path)

# One-hot encode labels into SparseVectors
encoder = OneHotEncoder(inputCols=['label'],
                        outputCols=['label_vec'],
                        dropLast=False)
model = encoder.fit(df)
train_df = model.transform(df)

# Train/test split
train_df, test_df = train_df.randomSplit([0.9, 0.1])

# Disable GPUs when building the model to prevent memory leaks
if LooseVersion(tf.__version__) >= LooseVersion('2.0.0'):
    # See https://github.com/tensorflow/tensorflow/issues/33168
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    keras.backend.set_session(
        tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})))
def one_hot_encoder(self, input_cols):
    output_cols = [each_col + "_vec" for each_col in input_cols]
    return OneHotEncoder(inputCols=input_cols, outputCols=output_cols), output_cols
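# Hedged usage sketch for the helper above: wire the returned encoder and its
# output column names into a Pipeline. The toy class, session, data, and column
# names are all illustrative assumptions, not part of the original code:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, VectorAssembler

class ToyFeatureBuilder:
    def one_hot_encoder(self, input_cols):
        output_cols = [each_col + "_vec" for each_col in input_cols]
        return OneHotEncoder(inputCols=input_cols, outputCols=output_cols), output_cols

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0)], ["a_idx", "b_idx"])
encoder, out_cols = ToyFeatureBuilder().one_hot_encoder(["a_idx", "b_idx"])
assembler = VectorAssembler(inputCols=out_cols, outputCol="features")
Pipeline(stages=[encoder, assembler]).fit(toy_df).transform(toy_df).show()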
# one-hot encoding
#==============================================================================
from pyspark.ml.feature import OneHotEncoder

# build indexer
categorical_columns = [
    'term', 'emp_length', 'home_ownership', 'purpose', 'state'
]
stringindexer_stages = [
    StringIndexer(inputCol=c, outputCol='stringindexed_' + c)
    for c in categorical_columns
]
onehotencoder_stages = [
    OneHotEncoder(inputCol='stringindexed_' + c, outputCol='onehotencoded_' + c)
    for c in categorical_columns
]
all_stages_transf1 = stringindexer_stages + onehotencoder_stages

## build pipeline model for transformation of data
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=all_stages_transf1)

## fit pipeline model
pipeline_model = pipeline.fit(loan_df_droped)

## transform data
df_coded = pipeline_model.transform(loan_df_droped)

## remove uncoded columns
selected_columns = ['onehotencoded_' + c for c in categorical_columns] + [
# | 24| M|  1|   technician| 85711|
# | 53| F|  2|        other| 94043|
# | 23| M|  3|       writer| 32067|
# | 24| M|  4|   technician| 43537|
# | 33| F|  5|        other| 15213|
# | 42| M|  6|    executive| 98101|
# | 57| M|  7|administrator| 91344|
# | 36| M|  8|administrator| 05201|
# | 29| M|  9|      student| 01002|
# | 53| M| 10|       lawyer| 90703|
# | 39| F| 11|        other| 30329|

indexer = StringIndexer(inputCol="occupations", outputCol="occupationsIndex", handleInvalid='error')
indexed = indexer.fit(users).transform(users)
# transfer the DataFrame to an RDD with ".rdd"
all_occupations = set(indexed.select("occupations", "occupationsIndex")
                      .rdd.map(lambda x: (x[0], x[1])).collect())
encoder = OneHotEncoder(inputCol="occupationsIndex", outputCol="occupationsVec")
encoded = encoder.transform(indexed)
encoded.select("occupations", "occupationsVec").show()
# +-------------+---------------+
# |  occupations| occupationsVec|
# +-------------+---------------+
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |       writer| (20,[7],[1.0])|
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |    executive| (20,[8],[1.0])|
# |administrator| (20,[3],[1.0])|
# |administrator| (20,[3],[1.0])|
# |      student| (20,[0],[1.0])|
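# Hedged aside: the index -> occupation mapping collected via the RDD round-trip
# above can also be read straight off the fitted StringIndexerModel, whose
# `labels` list is ordered by index (assumes the same `users` DataFrame):
from pyspark.ml.feature import StringIndexer

si_model = StringIndexer(inputCol="occupations", outputCol="occupationsIndex").fit(users)
index_to_occupation = dict(enumerate(si_model.labels))
# e.g. index 11 -> 'technician', matching (20,[11],[1.0]) in the output above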
# $example on$
from pyspark.ml.feature import OneHotEncoder
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                            outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
# test = test.drop("AGEImputed")
# train = train.drop("AGEImputed")
test.columns, train.columns
print("Number of training records: " + str(train.count()))
print("Number of testing records : " + str(test.count()))

# one-hot encoding 'pclass', 'embarked', 'sex'
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

# demonstrating the work of OneHotEncoder with StringIndexer
x = StringIndexer(inputCol="pclass", outputCol="pclass_1").fit(train).transform(train)
OneHotEncoder(inputCol="pclass_1", outputCol="pclass_2").transform(x).show(10)

si_pclass = StringIndexer(inputCol="pclass", outputCol="pclass_1")
si_embarked = StringIndexer(inputCol="embarked", outputCol="embarked_1")
si_survived = StringIndexer(inputCol="survived", outputCol="survived_1")
si_sex = StringIndexer(inputCol="sex", outputCol="sex_1")
vectorAssembler_features = VectorAssembler(
    inputCols=["pclass_1", "embarked_1", "sex_1", "survived_1"],
    outputCol="features")
rf = RandomForestRegressor(labelCol="age", featuresCol="features")
pipeline_rf = Pipeline(stages=[si_pclass, si_embarked, si_survived, si_sex,
                               vectorAssembler_features, rf])
train.printSchema()
pipeline_rf.fit(train)
    False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
    False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
    False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
    False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
    False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
    False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
).toDF(clustering_columns)

# drop rows with null values
lv_clustering_data = lv_clustering_data.dropna()

# Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

# initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

# expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_vec")
lv_assembled = assembler.transform(lv_encoded)
# different brands of chess sets, and use the column named
# `set` specifying which set each piece is from as one of
# the features.
chess = spark.table('chess.four_chess_sets')

# Use `StringIndexer` to convert `set` from string codes to
# numeric codes
indexer = StringIndexer(inputCol="set", outputCol="set_ix")
indexer_model = indexer.fit(chess)
list(enumerate(indexer_model.labels))
indexed = indexer_model.transform(chess)

# Depending on the model, we might also need to apply another
# transformer like the `OneHotEncoder` to generate a set of dummy variables
encoder = OneHotEncoder(inputCol="set_ix", outputCol="set_cd")
encoded = encoder.transform(indexed)
selected = encoded.select('base_diameter', 'height', 'set_cd', 'weight')
feature_columns = ['base_diameter', 'height', 'set_cd']

# We must assemble the features into a single column of vectors:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(selected)

(train, test) = assembled.randomSplit([0.8, 0.2])
rf = RandomForestRegressor(featuresCol="features", labelCol="weight")
rf_model = rf.fit(train)
test_with_predictions = rf_model.transform(test)
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

categoricalColumns = ["UNIQUE_CARRIER", "ORIGIN", "DEST"]
numericalColumns = ["DISTANCE"]

# Convert string categorical columns to indexed integers
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categoricalColumns
]

# One-hot encoding
encoders = [
    OneHotEncoder(
        inputCol=indexer.getOutputCol(),
        outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]

# Assembler for categorical columns
assemblerCategorical = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders], outputCol="cat")
stages = indexers + encoders + [assemblerCategorical]
pipelineCategorical = Pipeline(stages=stages)
df = pipelineCategorical.fit(df).transform(df)

# Assembler for numerical columns
assemblerNumerical = VectorAssembler(inputCols=numericalColumns, outputCol="num")
pipelineNumerical = Pipeline(stages=[assemblerNumerical])
df = pipelineNumerical.fit(df).transform(df)
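# Hedged final step (an assumption, not part of the original snippet): the two
# assembled vectors "cat" and "num" can be merged into one features column with
# a third VectorAssembler before handing the frame to an estimator:
assemblerFinal = VectorAssembler(inputCols=["cat", "num"], outputCol="features")
df = assemblerFinal.transform(df)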
df_0 = df_pandas[df_pandas['Harm'] == 0]
df_1 = df_pandas[df_pandas['Harm'] == 1]
df_2 = df_pandas[df_pandas['Harm'] == 2]
df_1_new = df_1.sample(frac=1.64, replace=True)
df_2_new = df_2.sample(frac=4.3, replace=True)
df_pandas = pd.concat([df_0, df_1_new, df_2_new], ignore_index=True)
# assign the shuffle back; a bare sample(frac=1) has no effect
df_pandas = df_pandas.sample(frac=1)
print(df_pandas['Harm'].value_counts())
df = spark.createDataFrame(df_pandas)
df.toPandas()

# In[8]:

# Preparing for machine learning
cbwd_Indexer = StringIndexer(inputCol='cbwd', outputCol='cbwdIndex')
cbwd_encoder = OneHotEncoder(inputCol='cbwdIndex', outputCol='cbwdVec')
Harm_Indexer = StringIndexer(inputCol='Harm', outputCol='label')
assembler = VectorAssembler(
    inputCols=['DEWP', 'HUMI', 'PRES', 'cbwdVec', 'TEMP', 'Iws'],
    outputCol="features")

# In[9]:

# Pipeline
pipeline = Pipeline(
    stages=[cbwd_Indexer, Harm_Indexer, cbwd_encoder, assembler])
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label', 'features')
# The following code does three things with a pipeline:
#
# * **`StringIndexer`** all categorical columns
# * **`OneHotEncoder`** all categorical index columns
# * **`VectorAssembler`** all feature columns into one vector column

# ### Categorical columns

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# categorical columns
categorical_columns = cuse.columns[0:3]
stage_string = [StringIndexer(inputCol=c, outputCol=c + "_string_encoded")
                for c in categorical_columns]
stage_one_hot = [OneHotEncoder(inputCol=c + "_string_encoded", outputCol=c + "_one_hot")
                 for c in categorical_columns]

ppl = Pipeline(stages=stage_string + stage_one_hot)
df = ppl.fit(cuse).transform(cuse)
df.toPandas().to_csv('cuse_afterTransform.csv')
df.select("age", 'age_string_encoded').distinct().sort(F.asc("age_string_encoded")).show()
df.select("education").distinct().show()
df.select("wantsMore").distinct().show()

# In[2]:

# ### Build VectorAssembler stage
df.columns
assembler = VectorAssembler(
    inputCols=['age_one_hot', 'education_one_hot',
#
# Let's break this down into multiple steps to make it all clear.

# In[12]:

from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                                OneHotEncoder, StringIndexer)

# In[13]:

#indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df).transform(df) for column in df.columns]
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

# In[14]:

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

# In[15]:

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age',
def oneHot(df, base_col_name, col_name):
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    import os
    import time

    #os.environ['SPARK_HOME'] = '/root/spark-2.1.1-bin'
    sparkConf = SparkConf() \
        .setAppName('pyspark rentmodel') \
        .setMaster('local[*]')
    sc = SparkContext.getOrCreate(sparkConf)
    sc.setLogLevel('WARN')
    spark = SparkSession(sparkContext=sc)

    df = df.select(base_col_name, col_name)
    df = df.filter(df[base_col_name].isNotNull())

    # the Python version of StringIndexer's handleInvalid has no 'keep' option,
    # so it cannot handle null values; fill them with a sentinel string instead
    null_col_name = col_name + '_null'
    df = df.na.fill(null_col_name, col_name)
    df_NULL = df.filter(df[col_name] == 'NULL')
    df = df.filter(df[col_name].isNotNull())
    df = df.filter(df[col_name] != '')
    print('one-hot=======', col_name, df.count())
    temp_path = '/data/20180621/ALL_58_beijing_save_models/'

    if df_NULL.count() > 0:
        def udf_NULL(s):
            return null_col_name
        udf_transf = udf(udf_NULL)
        df_NULL = df_NULL.select('*', udf_transf(col_name).alias('tmp_col_name'))
        df_NULL = df_NULL.na.fill(null_col_name, 'tmp_col_name')
        df_NULL = df_NULL.drop(col_name)
        df_NULL = df_NULL.withColumnRenamed('tmp_col_name', col_name)

        df_no_NULL = df.filter(df[col_name] != 'NULL')
        df_no_NULL = df_no_NULL.withColumn('tmp_col_name', df[col_name])
        df_no_NULL = df_no_NULL.drop(col_name)
        df_no_NULL = df_no_NULL.withColumnRenamed('tmp_col_name', col_name)
        df = df_no_NULL.union(df_NULL)
        del df_no_NULL

    index_name = col_name + 'Index'
    vector_name = col_name + 'Vec'
    """
    StringIndexer accepts handleInvalid='skip' but not handleInvalid='keep'.
    'skip' silently drops the rows in question, which makes for a poor user
    experience: a user submits a single record and it simply disappears. So we
    leave it unset for now; when new data arrives with an unseen string, one of
    the already-known strings can be chosen at random to replace it.
    """
    stringIndexer = StringIndexer(inputCol=col_name, outputCol=index_name)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(dropLast=False, inputCol=index_name, outputCol=vector_name)
    encoded = encoder.transform(indexed)

    # save
    stringIndexer.save(temp_path + 'stringIndexer' + col_name)
    model.save(temp_path + 'stringIndexer_model' + col_name)
    # StringIndexer(inputCol=col_name, outputCol=index_name)
    # onehotEncoderPath = temp_path + col_name
    # loadedEncoder = OneHotEncoder.load(onehotEncoderPath)
    # loadedEncoder.setParams(inputCol=index_name, outputCol=vector_name)
    # encoded = loadedEncoder.transform(df)
    # encoded.show()
    onehotEncoderPath = temp_path + col_name + '_new'
    encoder.save(onehotEncoderPath)
    sub_encoded = encoded.select(base_col_name, vector_name)
    return sub_encoded
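# Hedged companion sketch: reloading the stages persisted above for scoring.
# `temp_path`, `col_name`, and `new_df` are assumed from the surrounding code,
# and the transformer-style OneHotEncoder matches the Spark 2.x API the
# function itself uses:
from pyspark.ml.feature import OneHotEncoder, StringIndexerModel

loaded_model = StringIndexerModel.load(temp_path + 'stringIndexer_model' + col_name)
loaded_encoder = OneHotEncoder.load(temp_path + col_name + '_new')
scored = loaded_encoder.transform(loaded_model.transform(new_df))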
from pyspark.ml.feature import Imputer
df3 = Imputer(inputCols=['Age', 'Fare'], outputCols=['Age1', 'Fare1']).fit(df3).transform(df3)
df3.show(3)

#--------------------------------
# df3 = df2.select('Sex', df2.Pclass.cast('double'), df2.Survived.cast('double'), 'Embarked', df2.Fare.cast('double'), df2.Age.cast('double'))
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df3).transform(df3)
df3.show()
df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()

# --------------------------------------------
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df3)
df3.show(5)

# cast to double
#df3 = df3.select(df3.Pclass.cast('double'), df3.Gender1, df3.Embarked2, df3.Survived.cast('double'))
#df3.printSchema()

# Vector assembler
# COMMAND ----------

# MAGIC %md The ML package needs the label and feature vector to be added as columns to the input dataframe. We set up a pipeline to pass the data through transformers in order to extract the features and label. We index each categorical column using the `StringIndexer` to a column of number indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features allows decision trees to treat categorical features appropriately, improving performance. We then use the `StringIndexer` to encode our labels to label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol + "classVec")
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
numericCols = [
df.createOrReplaceTempView("df")
spark.sql("SELECT * from df").show()

# Please create a VectorAssembler which consumes columns X, Y and Z and produces a column "features"

# In[31]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

indexer = StringIndexer(inputCol="CLASS", outputCol="label")
encoder = OneHotEncoder(inputCol="label", outputCol="labelVec")
vectorAssembler = VectorAssembler(inputCols=["X", "Y", "Z"], outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

# Please instantiate a classifier from the SparkML package and assign it to the classifier variable. Make sure to either
# 1. Rename the "CLASS" column to "label" or
# 2. Specify the label column correctly to be "CLASS"

# In[55]:

# LogisticRegression accuracy was 54.7%; Naive Bayes requires non-negative data
# from pyspark.ml.classification import LogisticRegression
# classifier = LogisticRegression(maxIter=200, regParam=0.2, elasticNetParam=0.8)
def make_regr_model(data, sc, model_path, model_name, target, ml_model='default', save=True):
    t0 = time()
    # Stages for pipeline
    stages = []

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [x for (x, dataType) in trainingData.dtypes
               if ((dataType == "string") | (dataType == "boolean"))]
    numCols = [x for (x, dataType) in trainingData.dtypes
               if (((dataType == "int") | (dataType == "bigint")
                    | (dataType == "float") | (dataType == "double"))
                   & (x != "target"))]

    # OneHotEncode categorical variables
    indexers = [StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep")
                for column in catCols]
    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
    )
    assembler_cat = VectorAssembler(
        inputCols=encoder.getOutputCols(),
        outputCol="categorical-features",
        handleInvalid="skip"
    )
    stages += indexers
    stages += [encoder, assembler_cat]

    assembler_num = VectorAssembler(
        inputCols=numCols,
        outputCol="numerical-features",
        handleInvalid="skip"
    )
    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features", outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip"
    )
    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model.
    if ml_model == 'default':
        rf = RandomForestRegressor(labelCol="target", featuresCol="features")
    else:
        rf = ml_model
    stages += [rf]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    #predictions.select("prediction", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("RMSE = %g" % rmse)

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)
        cluster = Cluster(['127.0.0.1'], 9042)  # port must be an int, not a string
        session = cluster.connect("models")
        query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)") % ("models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(query, (model_name, timestamp, target, tt, model_path, rmse))
        session.shutdown()
        cluster.shutdown()

    # Stop spark session
    sc.stop()

    if not save:
        return model, sc
# StringIndexer
from pyspark.ml.feature import StringIndexer
categeriesIndexer = StringIndexer(inputCol="job", outputCol="job_index")
categeriesTransformer = categeriesIndexer.fit(df)
print(categeriesTransformer.labels)
indexed = categeriesTransformer.transform(df)

# OneHot encoder
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
# the encoder input must be the indexer's output column ("job_index")
onehotencoder = OneHotEncoder(inputCol='job_index', outputCol='job_vec')
encoded = onehotencoder.transform(indexed)

assembler = VectorAssembler(inputCols=["age", "user_id", "job_vec"], outputCol="features")

from pyspark.ml.classification import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                impurity="gini", maxDepth=5, maxBins=5)
# note: fitting dt_clf directly on df would fail (no "features" column yet);
# fit it through the pipeline below instead

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[categeriesIndexer, onehotencoder, assembler, dt_clf])
pipeline.getStages()
pipeModel = pipeline.fit(train_df)
predicted = pipeModel.transform(test_df)
print(predicted.columns)
numeric_col = ["qty_reference"]
imputer = Imputer(inputCols=numeric_col,
                  outputCols=["{}_imputed".format(c) for c in numeric_col])

categorical_col = ["SITE_FORMAT", "season"]
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexedd".format(c), handleInvalid='skip')
    for c in categorical_col
]
encoders = [
    OneHotEncoder(dropLast=True,
                  inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encodedd".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler = VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders] +
              [x + '_imputed' for x in numeric_col] +
              ['day', 'month', 'weekday', 'weekend', 'monthend',
               'monthbegin', 'monthquarter', 'yearquarter'],
    outputCol="Features")
pca = PCA(k=5, inputCol="Features", outputCol="pcaFeatures")

pipeline = Pipeline(stages=[dex, mex, yex, wdex, wex, meex, vex, mbex, mqex, yqex, ydex] + \
                    [imputer] + \
                    indexers + \
                    encoders + \
                    [assembler] + \
def build_indep_vars(df, independent_vars, categorical_vars=None,
                     keep_intermediate=False, summarizer=True):
    """
    Data verification

    df               : DataFrame
    independent_vars : List of column names
    categorical_vars : None or list of column names, e.g. ['col1', 'col2']
    """
    assert (
        type(df) is pyspark.sql.dataframe.DataFrame
    ), 'pyspark_glm: A pySpark dataframe is required as the first argument.'
    assert (
        type(independent_vars) is list
    ), 'pyspark_glm: List of independent variable column names must be the third argument.'
    for iv in independent_vars:
        assert (
            type(iv) is str
        ), 'pyspark_glm: Independent variables must be column name strings.'
        assert (
            iv in df.columns
        ), 'pyspark_glm: Independent variable name is not a dataframe column.'
    if categorical_vars:
        for cv in categorical_vars:
            assert (
                type(cv) is str
            ), 'pyspark_glm: Categorical variables must be column name strings.'
            assert (
                cv in df.columns
            ), 'pyspark_glm: Categorical variable name is not a dataframe column.'
            assert (
                cv in independent_vars
            ), 'pyspark_glm: Categorical variables must be independent variables.'

    """
    Code
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.ml.regression import GeneralizedLinearRegression

    if categorical_vars:
        string_indexer = [
            StringIndexer(inputCol=x, outputCol='{}_index'.format(x))
            for x in categorical_vars
        ]
        encoder = [
            OneHotEncoder(dropLast=True,
                          inputCol='{}_index'.format(x),
                          outputCol='{}_vector'.format(x))
            for x in categorical_vars
        ]
        independent_vars = [
            '{}_vector'.format(x) if x in categorical_vars else x
            for x in independent_vars
        ]
    else:
        string_indexer, encoder = [], []

    assembler = VectorAssembler(inputCols=independent_vars, outputCol='indep_vars')
    pipeline = Pipeline(stages=string_indexer + encoder + [assembler])
    model = pipeline.fit(df)
    df = model.transform(df)

    # for building the crosswalk between indices and column names
    if summarizer:
        param_crosswalk = {}
        i = 0
        for x in independent_vars:
            if '_vector' in x[-7:]:
                # slice off the '_vector' suffix (str.rstrip would strip
                # characters from the set, not the suffix itself)
                xrs = x[:-len('_vector')]
                dst = df[[xrs, '{}_index'.format(xrs)]].distinct().collect()
                for row in dst:
                    param_crosswalk[int(row['{}_index'.format(xrs)] + i)] = row[xrs]
                maxind = max(param_crosswalk.keys())
                del param_crosswalk[maxind]  # for dropLast
                i += len(dst)
            elif '_index' in x[:-6]:
                pass
            else:
                param_crosswalk[i] = x
                i += 1
        """
        {0: 'carat',
         1: u'SI1',
         2: u'VS2',
         3: u'SI2',
         4: u'VS1',
         5: u'VVS2',
         6: u'VVS1',
         7: u'IF'}
        """
        make_summary = Summarizer(param_crosswalk)

    if not keep_intermediate:
        fcols = [
            c for c in df.columns
            if '_index' not in c[-6:] and '_vector' not in c[-7:]
        ]
        df = df[fcols]

    if summarizer:
        return df, make_summary
    else:
        return df
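# Hedged usage sketch with toy diamonds-style data (column names and values are
# illustrative; summarizer=False because the Summarizer helper is defined
# elsewhere in the original module):
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import GeneralizedLinearRegression

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(0.3, 'SI1', 400.0), (0.4, 'VS2', 500.0), (0.5, 'SI1', 600.0), (0.7, 'VS2', 900.0)],
    ['carat', 'clarity', 'price'])
out = build_indep_vars(toy, ['carat', 'clarity'],
                       categorical_vars=['clarity'], summarizer=False)
glm = GeneralizedLinearRegression(labelCol='price', featuresCol='indep_vars')
glm_model = glm.fit(out)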
# In[101]:

df_model = df_ORG
# stringIndexer1 = StringIndexer(inputCol="Origin", outputCol="originIndex")
# model_stringIndexer = stringIndexer1.fit(df_model)
# indexedOrigin = model_stringIndexer.transform(df_model)
# encoder1 = OneHotEncoder(dropLast=False, inputCol="originIndex", outputCol="originVec")
# df_model = encoder1.transform(indexedOrigin)

# In[ ]:

stringIndexer2 = StringIndexer(inputCol="Dest", outputCol="destIndex")
model_stringIndexer = stringIndexer2.fit(df_model)
indexedDest = model_stringIndexer.transform(df_model)
encoder2 = OneHotEncoder(dropLast=False, inputCol="destIndex", outputCol="destVec")
df_model = encoder2.transform(indexedDest)

# We use __labeled point__ to make local vectors associated with a label/response. In MLlib, labeled points are used in supervised learning algorithms and they are stored as doubles. For binary classification, a label should be either 0 (negative) or 1 (positive).

# In[105]:

assembler = VectorAssembler(
    inputCols=['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Hour', 'Distance', 'destVec'],
    outputCol="features")
output = assembler.transform(df_model)
# go through .rdd to map over rows (DataFrames do not expose .map directly)
airlineRDD = output.rdd.map(lambda row: LabeledPoint([0, 1][row['DepDelayed']], row['features']))

# ### Preprocessing: Splitting dataset into train and test datasets
        WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
        WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
# Input data-frame is the cleaned one from above
model = stringIndexer.fit(taxi_df_test_with_newFeatures)
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
encoded3 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="TrafficTimeBins", outputCol="TrafficTimeBinsIndex")
    (Vectors.dense(2, 5, 6), 2),
    (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
    .setInputCol("features")\
    .setOutputCol("idxed")\
    .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df:           Original DataFrame as read from the features_file.
    train_df:              A DataFrame with columns Lat, Lon, Pickup_Count and
                           vector columns Features & ScaledFeatures.
                           Contains only data before 2015.
    test_df:               As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """
    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
        assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                    outputCol='FeaturesToScale')
        self.features_df = assembler.transform(self.features_df)
        scaler = StandardScaler(inputCol='FeaturesToScale',
                                outputCol='ScaledFeatures',
                                withStd=True, withMean=False)
        self.features_df = scaler.fit(self.features_df).transform(self.features_df)
        exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Adopt categorical features that do not have a value range of [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(
                column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
        vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
        for i in range(len(self.ONE_HOT_COLUMNS)):
            column = self.ONE_HOT_COLUMNS[i]
            if column in self.features_df.columns:
                self.features_df = self.features_df.withColumn(
                    column, self.features_df[column].cast(DoubleType()))
                encoder = OneHotEncoder(inputCol=column,
                                        outputCol=vec_category_columns[i],
                                        dropLast=False)
                self.features_df = encoder.transform(self.features_df)
        exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                       if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {
            i: self.CATEGORY_VALUES_COUNT[feature_columns[i]]
            for i in range(len(feature_columns))
            if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()}

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time > split_date).cache()

    # Compute districts with counts
    self.districts_with_counts = self.features_df \
        .groupBy([self.features_df.Lat, self.features_df.Lon]) \
        .count()
# Encoding categorical variables
from pyspark.sql.functions import *
categoricalCol = ["SEX", "MARRIAGE", "AGE", "EDUCATION", "PAY_0", "PAY_2",
                  "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

for c in categoricalCol:
    str1 = c + "_Index"
    str2 = c + "_Vec"
    stringIndexer = StringIndexer().setInputCol(c).setOutputCol(str1)
    model = stringIndexer.fit(trans_df3)
    indexed = model.transform(trans_df3)
    encoder = OneHotEncoder().setInputCol(str1).setOutputCol(str2)
    trans_df3 = encoder.transform(indexed)
trans_df3.show(3)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
#assembler = VectorAssembler(
#    inputCols=["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"],
#    outputCol="features")
assembler = VectorAssembler(
    inputCols=["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4",
               "BILL_AMT5", "BILL_AMT6",
               "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
               "SEX_Vec", "MARRIAGE_Vec", "AGE_Vec", "EDUCATION_Vec",
               "PAY_0_Vec", "PAY_2_Vec", "PAY_3_Vec", "PAY_4_Vec", "PAY_5_Vec", "PAY_6_Vec"],
    outputCol="features")
def transform(df, spark, sql_query=None, numerical_features=[], categorical_features=[],
              normalize=True, normalize_p=2):

    # Apply SQL query
    if sql_query is not None:
        df.createOrReplaceTempView("netlytics")
        # Execute query
        result_df = spark.sql(sql_query)
        df = result_df

    # Transform strings into one-hot vectors
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in categorical_features:
        # Replace None values
        k = col(feature)
        df = df.withColumn(feature, when(k.isNull(), "__NA__").otherwise(k))

        stringIndexer = StringIndexer(inputCol=feature, outputCol=feature + "_indexed",
                                      handleInvalid="skip")
        model = stringIndexer.fit(df)
        df = model.transform(df)
        encoder = OneHotEncoder(inputCol=feature + "_indexed",
                                outputCol=feature + "_encoded")
        df = encoder.transform(df)

    # Extract features
    def extract_features(row, numerical_features, feat_to_type):
        output_features = {}
        fields = list(row.asDict().keys())
        for field in fields:
            if field in numerical_features and feat_to_type[field] != "StringType":
                output_features[field] = float(row[field])
            if field.endswith("_encoded"):
                output_list = list(row[field])
                for i, v in enumerate(output_list):
                    tmp_field = field + "_" + str(i)
                    output_features[tmp_field] = float(v)
        features = [
            v for k, v in sorted(output_features.items(),
                                 key=operator.itemgetter(0))
        ]
        old_dict = row.asDict()
        old_dict["features"] = DenseVector(features)
        new_row = Row(**old_dict)
        return new_row

    rdd = df.rdd.map(
        lambda row: extract_features(row, numerical_features, feat_to_type))
    df = spark.createDataFrame(rdd, samplingRatio=1, verifySchema=False)

    # Normalize
    if normalize:
        normalizer = Normalizer(inputCol="features", outputCol="featuresNorm",
                                p=normalize_p)
        df = normalizer.transform(df)
        df = df.drop("features")
        df = df.withColumnRenamed("featuresNorm", "features")

    # Delete intermediate columns
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)
    for feature in feat_to_type:
        if feat_to_type[feature] != "StringType":
            if feature.endswith("_encoded") or feature.endswith("_indexed"):
                df = df.drop(feature)

    return df
# MAGIC %md
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

### One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "education", "marital_status", "occupation",
                      "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    model = stringIndexer.fit(dataset)
    indexed = model.transform(dataset)
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    encoded = encoder.transform(indexed)
    dataset = encoded
print(dataset.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

# COMMAND ----------

# MAGIC %md
# MAGIC We use the StringIndexer() again here to encode our labels to label indices
"Soil_Type9", "Soil_Type10", "Soil_Type11", "Soil_Type12", "Soil_Type13", "Soil_Type14", "Soil_Type16", "Soil_Type17", "Soil_Type18", "Soil_Type19", "Soil_Type20", "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24", "Soil_Type26", "Soil_Type27", "Soil_Type28", "Soil_Type29", "Soil_Type30", "Soil_Type31", "Soil_Type32", "Soil_Type33", "Soil_Type34", "Soil_Type35", "Soil_Type36", "Soil_Type37", "Soil_Type38", "Soil_Type39", "Soil_Type40"] # Soil_Type7,"Soil_Type8", Soil_Type15, Soil_Type25 : always the same value for categoricalCol in categoricalColumns: # Category Indexing with StringIndexer stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index") # Use OneHotEncoder to convert categorical variables into binary SparseVectors encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec") # Add stages. These are not run here, but will run all at once later on. stages += [stringIndexer, encoder] # Numerical columns : create vecAssembler numericCols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4"] # Transform all features into a vector using VectorAssembler assemblerInputs = list(map(lambda c: c + "classVec", categoricalColumns)) + numericCols vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [vecAssembler] # Split existing trainingData into training and test sets (30% held out for testing) (training, test) = trainingData.randomSplit([0.7, 0.3], seed=1234)