# Use the new indexed field to obtain a one-hot-encoded field

# In[15]:

from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCol="WorkClass_index", outputCol="WorkClass_encoded")
# On Spark 3.x OneHotEncoder is an Estimator: it must be fitted, and the
# fitted model performs the transform. On Spark 2.x it is a plain Transformer
# without fit(). Supporting both keeps this cell runnable on either version.
encoderModel = encoder.fit(indexedDF) if hasattr(encoder, "fit") else encoder
encodedDF = encoderModel.transform(indexedDF)

# #### A WorkClass_encoded field is created
# * This contains the one-hot encoding for WorkClass
# * The encoder cannot operate directly on a column with string values - values
#   need to be numeric. Hence we use WorkClass_index as the input

# In[16]:

# Limit on the Spark side before converting, so only the previewed rows are
# collected to the driver rather than the whole dataset.
encodedDF.limit(5).toPandas()

# #### View the original and transformed fields together

# In[17]:

encodedDF.select('WorkClass', 'WorkClass_index', 'WorkClass_encoded').limit(5).toPandas()

# ### Transform the entire dataset
# * So far we have only transformed a single column
# * We need to perform this transformation for every categorical and non-numeric column
# * This will be simplified by using a Pipeline (a feature of Spark ML)

# #### First, split the data into training and test sets