#  1. Indexing: assigning a numerical index to each distinct categorical value
#  2. Encoding: turning that index into a one-hot vector.

# First we'll index (NOTE that StringIndexer works on numeric data as well)
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

conditionIndexer = StringIndexer(inputCol="condition", outputCol="condition_index")

gradeIndexer = StringIndexer(inputCol="grade", outputCol="grade_index")

zipcodeIndexer = StringIndexer(inputCol="zipcode", outputCol="zipcode_index")
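
# A minimal sketch of what the indexing step produces (assumes an active
# SparkSession named `spark`, as in a typical notebook; this tiny demo frame
# is only for illustration and is not part of the housing data):
demo = spark.createDataFrame([("fair",), ("good",), ("good",)], ["condition"])
demo_indexed = StringIndexer(inputCol="condition", outputCol="condition_index").fit(demo).transform(demo)
demo_indexed.show()  # "good" is the most frequent value, so it maps to index 0.0 (frequencyDesc is the default ordering)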

# Now we'll encode the indexed categorical columns into one-hot vectors
encoder = OneHotEncoder(
    inputCols=["condition_index", "grade_index", "zipcode_index"],
    outputCols=["condition_vector", "grade_vector", "zipcode_vector"])
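
# Sketch of what the encoder produces, reusing the demo frame above: each index
# becomes a sparse vector, and with the default dropLast=True a column with N
# distinct values yields a vector of length N - 1.
OneHotEncoder(inputCols=["condition_index"], outputCols=["condition_vector"]) \
    .fit(demo_indexed).transform(demo_indexed).show(truncate=False)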

# Assemble all of the features into a single Spark ML vector

# Let's define our vector with only the features we actually want to use to build the model.
# We'll ignore the columns above that are highly correlated with one another.

# Note that waterfront is treated as a boolean, so we didn't have to encode it.
# We can just add it to the vector assembler.
assembler = VectorAssembler(
    inputCols=["bedrooms", "bathrooms", "sqft_living", "sqft_above_percentage",
               "floors", "condition_vector", "grade_vector", "zipcode_vector", "waterfront"],
    outputCol="features")

# Build a grid of hyperparameters so we can test all permutations