# MAGIC A Decision Tree creates a model based on splitting variables using a tree structure. We will first start with a single decision tree model.
# MAGIC
# MAGIC Reference Decision Trees: https://en.wikipedia.org/wiki/Decision_tree_learning

# COMMAND ----------

# MAGIC %md
# MAGIC ### Decision Tree Models

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Configure the regressor: target column, prediction output column, the
# assembled feature vector, and enough bins for the continuous inputs.
# Each setter returns the estimator, so the calls chain.
dt = (DecisionTreeRegressor()
      .setLabelCol("PE")
      .setPredictionCol("Predicted_PE")
      .setFeaturesCol("features")
      .setMaxBins(100))

# Two-stage pipeline: feature assembly, then the tree.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])

# Let's just resuse our CrossValidator
crossval.setEstimator(dtPipeline)

# Sweep tree depths 2 through 7 during cross-validation.
depthGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, range(2, 8))
             .build())
crossval.setEstimatorParamMaps(depthGrid)

# Fit the cross-validated pipeline on the training data.
dtModel = crossval.fit(trainingSet)
from pyspark.ml.feature import VectorAssembler

# Combine all predictor columns (every column except the last two) into a
# single "features" vector column, as Spark ML estimators require.
# NOTE(review): assumes `columns` is defined in an earlier cell — confirm.
vectorizer = VectorAssembler()
vectorizer.setInputCols(columns[:-2])
vectorizer.setOutputCol("features")

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Create a DecisionTreeRegressor
dt = DecisionTreeRegressor()
# Configure target ("6714"), output column, feature vector, and bin count.
dt.setPredictionCol("Prediction_cuisine")\
  .setLabelCol("6714")\
  .setFeaturesCol("features")\
  .setMaxBins(100)

# Create a Pipeline
dtPipeline = Pipeline()

# Set the stages of the Pipeline
dtPipeline.setStages([vectorizer, dt])

# Let's first train on the entire dataset to see what we get
dtModel = dtPipeline.fit(trainingSetDF)

# COMMAND ----------

# Score the held-out test set with the fitted pipeline.
resultsDtDf = dtModel.transform(testSetDF)

# NOTE(review): this save call is truncated at the end of this chunk — the
# remaining arguments (e.g. format/mode) continue beyond the visible source.
resultsDtDf.write.save('/mnt/data/resultsDtDf.parquet',
display(trainingSetDF)

# Assemble the four predictor columns into a single "features" vector
# so the downstream estimator receives one input column.
vectorizer = VectorAssembler()
vectorizer.setInputCols(["AT", "V", "AP", "RH"])
vectorizer.setOutputCol("features")

# Regression tree configured with:
#   - Predicted_PE: column that will hold the estimated predictions
#   - features: column holding the predictor vector
#   - PE: column holding the actual target value
#   - depth of 8 levels and up to 100 bins per feature
dt = DecisionTreeRegressor()
dt.setPredictionCol("Predicted_PE")
dt.setFeaturesCol("features")
dt.setLabelCol("PE")
dt.setMaxDepth(8)
dt.setMaxBins(100)

# Two-element pipeline — a VectorAssembler followed by the decision tree —
# whose fitted components remain reachable through the 'stages' attribute.
pipeline = Pipeline(stages=[vectorizer, dt])

# Fit (run) the whole pipeline on the training set.
model = pipeline.fit(trainingSetDF)

# Inspect the fitted stages.
vectAssembler = model.stages[0]
dtModel = model.stages[1]
print("Nodos: " + str(dtModel.numNodes))
print("Profundidad: " + str(dtModel.depth))

# summary only
print(dtModel.toDebugString)