# MAGIC
# MAGIC A Decision Tree creates a model based on splitting variables using a tree structure. We will first start with a single decision tree model.
# MAGIC
# MAGIC Reference Decision Trees: https://en.wikipedia.org/wiki/Decision_tree_learning

# COMMAND ----------

# MAGIC %md
# MAGIC ### Decision Tree Models

# COMMAND ----------

# FIX: Pipeline and ParamGridBuilder were used below without being imported
# anywhere earlier in the file (Pipeline is only imported in a later cell).
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.regression import DecisionTreeRegressor

# Configure a decision-tree regressor: predict label column "PE" from the
# assembled "features" vector, writing predictions to "Predicted_PE".
dt = DecisionTreeRegressor()
dt.setLabelCol("PE")
dt.setPredictionCol("Predicted_PE")
dt.setFeaturesCol("features")
dt.setMaxBins(100)  # allow up to 100 bins when discretizing continuous features

# Two-stage pipeline: assemble features, then fit the tree.
# NOTE(review): `vectorizer` is defined in another cell — confirm it is in scope.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])

# Let's just reuse our CrossValidator.
# NOTE(review): `crossval` comes from an earlier, unseen cell; a later cell
# spells it `crossVal` — confirm which spelling the notebook actually defines.
crossval.setEstimator(dtPipeline)

# Grid-search tree depths 2..7 (range end is exclusive).
paramGrid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, range(2, 8))\
    .build()

crossval.setEstimatorParamMaps(paramGrid)
# Fit via cross-validation and keep the best model found on the grid.
# NOTE(review): an earlier cell uses lowercase `crossval` — confirm which
# spelling the notebook actually defines; only one of them can be right.
cvModel = crossVal.fit(trainSetDF).bestModel

# Score the held-out set, keeping the raw input columns alongside the
# observed label ("Power_Output") and the prediction for inspection.
predictionsAndLabelsDF = cvModel.transform(testSetDF).select(
    "Atmospheric_Temperature",
    "Vacuum_Speed",
    "Atmospheric_Pressure",
    "Relative_Humidity",
    "Power_Output",
    "Predicted_PE",
)

# Evaluate the new model: the evaluator's configured metric first, then
# R-squared requested explicitly via a param override.
rmseNew = regEval.evaluate(predictionsAndLabelsDF)
r2New = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})

# Build a fresh DecisionTreeRegressor predicting Power_Output from the
# assembled "features" vector.
decisionTree = DecisionTreeRegressor()
decisionTree.setLabelCol('Power_Output') \
    .setPredictionCol('Predicted_PE') \
    .setFeaturesCol('features') \
    .setMaxBins(100)

# Pipeline: assemble features, then fit the tree.
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, decisionTree])

# Reuse the cross validator with the new pipeline as its estimator.
crossVal.setEstimator(dtPipeline)

# Tune maxDepth over the values {2, 3}.
paramGrid = ParamGridBuilder().addGrid(decisionTree.maxDepth, [2, 3]).build()

# Attach the grid and keep the best model found by cross-validation.
crossVal.setEstimatorParamMaps(paramGrid)
dtModel = crossVal.fit(trainSetDF).bestModel
# Replace missing benefit amounts with 0 before training.
train_data = train_data.na.fill({'TOTAL_BENEFICIARY_AMT': 0})

from pyspark.ml.feature import VectorAssembler

# Assemble the four numeric diagnosis/procedure code columns into a
# single "features" vector column.
vectorizer = VectorAssembler(
    inputCols=[
        "ICD9_DGNS_CD_1N",
        "ICD9_DGNS_CD_2N",
        "ICD9_PRCDR_CD_1N",
        "ICD9_PRCDR_CD_2N",
    ],
    outputCol="features",
)

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline

# Depth-8 regression tree predicting the beneficiary amount; a very large
# maxBins (10000) so the high-cardinality code columns can be split finely.
dt = DecisionTreeRegressor(
    maxDepth=8,
    labelCol="TOTAL_BENEFICIARY_AMT",
    predictionCol="Predicted_EXP",
    featuresCol="features",
    maxBins=10000,
)

# Two-stage pipeline: vectorize, then fit the tree on the training data,
# and score the training set itself.
dtPipeline = Pipeline(stages=[vectorizer, dt])
model = dtPipeline.fit(train_data)
train_data_output = model.transform(train_data)

from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing predictions against the label. Note the configured
# metric is R-squared ("r2"), not RMSE.
regEval = RegressionEvaluator(
    predictionCol="Predicted_EXP",
    labelCol="TOTAL_BENEFICIARY_AMT",
    metricName="r2",
)