예제 #1
0
# Use the new indexed field to obtain a one-hot-encoded field

# In[15]:

from pyspark.ml.feature import OneHotEncoder

# Configure the encoder: it reads the numeric category index produced earlier
# and writes the one-hot vector into a new WorkClass_encoded column.
workclass_encoder = OneHotEncoder(inputCol="WorkClass_index",
                                  outputCol="WorkClass_encoded")

# Spark >= 3.0 turned OneHotEncoder into an Estimator: it must be fit first,
# and only the fitted model exposes transform(). The Spark 2.x class is a plain
# Transformer without fit(), so fit only when the method exists. This keeps
# the cell runnable on both API generations.
workclass_encoder_model = (workclass_encoder.fit(indexedDF)
                           if hasattr(workclass_encoder, "fit")
                           else workclass_encoder)

encodedDF = workclass_encoder_model.transform(indexedDF)

# #### A WorkClass_encoded field is created
# * This contains the one-hot-encoding for WorkClass
# * OneHotEncoder cannot operate directly on a column of string values - its input must be numeric category indices. Hence we use WorkClass_index as input

# In[16]:

# Preview the first 5 rows. Limit inside Spark *before* converting so only
# those rows are collected; toPandas() on the full DataFrame would pull every
# row onto the driver just to throw all but five away.
encodedDF.limit(5).toPandas()

# #### View the original and transformed fields together

# In[17]:

# Show the raw label, its numeric index and its one-hot vector side by side.
# limit(5) runs in Spark, so only the previewed rows are collected to the
# driver instead of converting the whole projection to pandas first.
(encodedDF
 .select('WorkClass', 'WorkClass_index', 'WorkClass_encoded')
 .limit(5)
 .toPandas())

# ### Transform the entire dataset
# * So far we have only transformed a single column
# * We need to perform this transformation for every categorical and non-numeric column
# * This will be simplified by using a Pipeline (a feature of Spark ML)

# ####  First, split the data into training and test sets