# Persist the best decision-tree results for later analysis.
# NOTE(review): the original passed `header=True`, which is a CSV option and is
# silently ignored by the parquet writer — removed to avoid confusion.
resultsBestDtDf.write.save('/mnt/data/resultsBestDtDf.parquet', format='parquet', mode="overwrite")

# COMMAND ----------

# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor

# Create a RandomForestRegressor.
rf = RandomForestRegressor()

# Configure the forest: label column is named "6714" in this dataset,
# features come from the upstream vectorizer stage.
# FIX: seed was written as `190088121L` (Python 2 long suffix) — a
# SyntaxError on Python 3; plain int literal is correct on both.
rf.setPredictionCol("Prediction_cuisine")\
  .setLabelCol("6714")\
  .setFeaturesCol("features")\
  .setSeed(190088121)\
  .setMaxDepth(8)\
  .setNumTrees(25)

# Create a Pipeline.
rfPipeline = Pipeline()

# Set the stages of the Pipeline: vectorize features, then fit the forest.
rfPipeline.setStages([vectorizer, rf])

# Let's first train on the entire dataset to see what we get.
rfModel = rfPipeline.fit(trainingSetDF)

# COMMAND ----------
trainingSetDF = split80DF
testSetDF = split20DF

# Cache both splits so repeated Spark actions don't recompute the lineage.
trainingSetDF.cache()
testSetDF.cache()

# Decision-tree ensemble (random forest) regressor.
rf = RandomForestRegressor()

# For information about the available parameters:
print(rf.explainParams())

rf.setPredictionCol('Predicted_PE')\
  .setLabelCol('PE')\
  .setNumTrees(20)\
  .setMaxDepth(5)

# Forest Pipeline: vectorize features, then fit the forest.
pipeline = Pipeline(stages=[vectorizer, rf])

# Train the model.
model = pipeline.fit(trainingSetDF)

# Details of the fitted forest.
# FIX: the original "commented out" these prints by opening a triple-quoted
# string (`"""`) that was never closed — a SyntaxError as written. Converted
# to plain comments instead; uncomment to inspect the fitted model.
# print("Nodos: " + str(model.stages[-1]._java_obj.parent().getNumTrees()))
# print("Profundidad: " + str(model.stages[-1]._java_obj.parent().getMaxDepth()))