# MAGIC A Decision Tree builds a model by recursively splitting on input variables using a tree structure. We will start with a single decision tree model.
# MAGIC
# MAGIC Reference Decision Trees: https://en.wikipedia.org/wiki/Decision_tree_learning

# COMMAND ----------

# MAGIC %md
# MAGIC ###  Decision Tree Models

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Configure a regression tree: learn label "PE" from the assembled
# "features" vector, writing predictions to "Predicted_PE"; allow up to
# 100 bins when discretizing continuous features.
dt = (DecisionTreeRegressor()
      .setLabelCol("PE")
      .setPredictionCol("Predicted_PE")
      .setFeaturesCol("features")
      .setMaxBins(100))

# Two-stage pipeline: vector assembly followed by the tree model.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])

# Reuse the existing CrossValidator, swapping in the tree pipeline.
crossval.setEstimator(dtPipeline)

# Search tree depths 2 through 7.
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, range(2, 8))
             .build())
crossval.setEstimatorParamMaps(paramGrid)

# Cross-validated fit on the training data.
dtModel = crossval.fit(trainingSet)
# --- Example #2 ---
from pyspark.ml.feature import VectorAssembler

# Assemble every column except the last two into one "features" vector.
vectorizer = (VectorAssembler()
              .setInputCols(columns[:-2])
              .setOutputCol("features"))

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Regression tree: learn label column "6714" from the assembled
# "features" vector; predictions land in "Prediction_cuisine".
# Up to 100 bins are used when splitting continuous features.
dt = DecisionTreeRegressor()
dt.setPredictionCol("Prediction_cuisine")
dt.setLabelCol("6714")
dt.setFeaturesCol("features")
dt.setMaxBins(100)

# Two-stage pipeline: the vector assembler feeds the tree model.
dtPipeline = Pipeline(stages=[vectorizer, dt])

# Train on the entire training set first to get a baseline model.
dtModel = dtPipeline.fit(trainingSetDF)

# COMMAND ----------

# Score the held-out test set with the fitted tree model.
resultsDtDf = dtModel.transform(testSetDF)
# NOTE(review): this statement is truncated in the source — the argument
# list is left open after the output path; the remaining save options
# (and closing parenthesis) are missing from this chunk.
resultsDtDf.write.save('/mnt/data/resultsDtDf.parquet',
# --- Example #3 ---
# Preview the training data.
display(trainingSetDF)

# Combine the four predictor columns into a single "features" vector.
vectorizer = (VectorAssembler()
              .setInputCols(["AT", "V", "AP", "RH"])
              .setOutputCol("features"))

# Regression tree configuration:
# - Predicted_PE: column that will hold the estimated predictions
# - features: column holding the assembled predictor vector
# - PE: column holding the true label
# - depth capped at 8 levels, up to 100 bins for continuous splits
dt = DecisionTreeRegressor()
dt.setPredictionCol("Predicted_PE")
dt.setMaxBins(100)
dt.setFeaturesCol("features")
dt.setLabelCol("PE")
dt.setMaxDepth(8)

# Two-element pipeline (assembler then tree), exposed via its 'stages'
# attribute, fitted on the training data.
pipeline = Pipeline(stages=[vectorizer, dt])
model = pipeline.fit(trainingSetDF)

# Pull the fitted stages back out and report the tree's size, depth,
# and full rule dump.
vectAssembler = model.stages[0]
dtModel = model.stages[1]
print(f"Nodos: {dtModel.numNodes}")
print(f"Profundidad: {dtModel.depth}")  # summary only
print(dtModel.toDebugString)