# Step 1 — Indexing: map each categorical value to a numeric index.
# (StringIndexer also accepts numeric input columns, e.g. zipcode.)
conditionIndexer = StringIndexer(inputCol="condition", outputCol="condition_index")
gradeIndexer = StringIndexer(inputCol="grade", outputCol="grade_index")
zipcodeIndexer = StringIndexer(inputCol="zipcode", outputCol="zipcode_index")

# Step 2 — Encoding: turn each indexed categorical column into a one-hot vector.
encoder = OneHotEncoder(
    inputCols=["condition_index", "grade_index", "zipcode_index"],
    outputCols=["condition_vector", "grade_vector", "zipcode_vector"],
)

# Step 3 — Assembly: combine the selected feature columns into a single
# Spark ML "features" vector.
# Columns that are highly correlated with one another are deliberately
# left out. "waterfront" is already boolean-valued, so it needs no
# indexing/encoding and can be fed to the assembler directly.
assembler = VectorAssembler(
    inputCols=[
        "bedrooms",
        "bathrooms",
        "sqft_living",
        "sqft_above_percentage",
        "floors",
        "condition_vector",
        "grade_vector",
        "zipcode_vector",
        "waterfront",
    ],
    outputCol="features",
)

# Next: build a grid of hyperparameters so every permutation can be tested.