# MAGIC
# MAGIC A Decision Tree creates a model based on splitting variables using a tree structure. We will first start with a single decision tree model.
# MAGIC
# MAGIC Reference Decision Trees: https://en.wikipedia.org/wiki/Decision_tree_learning

# COMMAND ----------

# MAGIC %md
# MAGIC ###  Decision Tree Models

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Single decision-tree regressor predicting power output ("PE") from the
# assembled feature vector.
dt = DecisionTreeRegressor()
dt.setLabelCol("PE")
dt.setPredictionCol("Predicted_PE")
dt.setFeaturesCol("features")
# maxBins must be >= the cardinality of any categorical feature; 100 is a
# generous ceiling here.
dt.setMaxBins(100)

# Two-stage pipeline: feature assembly, then the tree.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])

# Let's just reuse our CrossValidator, swapping in the tree pipeline.
crossval.setEstimator(dtPipeline)

# Tune the tree depth over 2..7.
paramGrid = ParamGridBuilder()\
  .addGrid(dt.maxDepth, range(2, 8))\
  .build()
crossval.setEstimatorParamMaps(paramGrid)

# FIX(review): original called `crossVal.fit(...)` here, but the validator in
# this chunk is named `crossval` (see above) — that was a NameError. Also
# removed two stray pasted lines ("Пример #2" / "0") that would fail at runtime.
cvModel = crossval.fit(trainSetDF).bestModel

predictionsAndLabelsDF = cvModel.transform(testSetDF).select("Atmospheric_Temperature", "Vacuum_Speed", "Atmospheric_Pressure", "Relative_Humidity", "Power_Output", "Predicted_PE")

# Evaluate the tree with the shared evaluator.
# NOTE(review): the metric this reports depends on how regEval was configured
# elsewhere — confirm it is RMSE before relying on the variable name.
rmseNew = regEval.evaluate(predictionsAndLabelsDF)

# R-squared for the same predictions (metric overridden per-call).
r2New = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

# print("Regularization parameter of the best model: {0:.2f}".format(cvModel.stages[-1]._java_obj.parent().getRegParam()))


# Build a decision tree that predicts Power_Output from the feature vector.
decisionTree = DecisionTreeRegressor()
decisionTree.setLabelCol('Power_Output')
decisionTree.setPredictionCol('Predicted_PE')
decisionTree.setFeaturesCol('features')
decisionTree.setMaxBins(100)

# Assemble the two-stage pipeline: vectorizer first, then the tree.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, decisionTree])

# Hand the pipeline to the existing cross-validator.
crossVal.setEstimator(dtPipeline)

# Search just two candidate depths for maxDepth: 2 and 3.
paramGrid = ParamGridBuilder().addGrid(decisionTree.maxDepth, [2, 3]).build()
crossVal.setEstimatorParamMaps(paramGrid)

# Fit on the training set and keep only the best model found.
dtModel = crossVal.fit(trainSetDF).bestModel
# Treat missing benefit amounts as zero before training.
train_data = train_data.na.fill({'TOTAL_BENEFICIARY_AMT': 0})

from pyspark.ml.feature import VectorAssembler

# Pack the four numeric ICD-9 diagnosis/procedure code columns into a single
# "features" vector column for the regressor.
vectorizer = VectorAssembler(
    inputCols=["ICD9_DGNS_CD_1N", "ICD9_DGNS_CD_2N",
               "ICD9_PRCDR_CD_1N", "ICD9_PRCDR_CD_2N"],
    outputCol="features",
)

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Depth-8 decision tree predicting the total beneficiary amount.
dt = DecisionTreeRegressor(maxDepth = 8)

dt.setLabelCol("TOTAL_BENEFICIARY_AMT")
dt.setPredictionCol("Predicted_EXP")
dt.setFeaturesCol("features")
# Very large bin budget so high-cardinality code columns can still be split.
# NOTE(review): 10000 bins is expensive — confirm this is actually needed.
dt.setMaxBins(10000)

# Pipeline: assemble features, then fit the tree on the training data and
# score the training set itself.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])
model = dtPipeline.fit(train_data)
train_data_output = model.transform(train_data)

from pyspark.ml.evaluation import RegressionEvaluator

# Regression evaluator comparing Predicted_EXP against the true amounts.
# NOTE(review): metricName is "r2", so this reports R-squared — the original
# comment called it an "RMSE evaluator"; confirm which metric was intended.
regEval = RegressionEvaluator(predictionCol="Predicted_EXP", labelCol="TOTAL_BENEFICIARY_AMT", metricName="r2")