# MAGIC %md # MAGIC # MAGIC ## Cross Validation # COMMAND ---------- # MAGIC %md # MAGIC For each model you can run the below command to see its params and a brief explanation of each. # COMMAND ---------- print(lr.explainParams()) # COMMAND ---------- print(gb.explainParams()) # COMMAND ---------- # MAGIC %md # MAGIC # MAGIC #### Logistic Regression - Param Grid # COMMAND ---------- from pyspark.ml.tuning import ParamGridBuilder, CrossValidator # Create ParamGrid for Cross Validation lrParamGrid = (ParamGridBuilder().addGrid( lr.regParam, [0.01, 0.5, 2.0]).addGrid(lr.elasticNetParam,
# 12. All steps: Gradient-Boosted Tree classification workflow.
from pyspark.ml.classification import GBTClassifier

# 12.1 Train a gradient-boosted tree classifier (10 boosting iterations).
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)

# 12.2 Score the held-out test split and inspect a sample of predictions.
predictions = gbtModel.transform(test)
predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction', 'probability').show(10)

# 12.3 Evaluate the scored data (BinaryClassificationEvaluator's default metric
# is areaUnderROC).
# NOTE(review): BinaryClassificationEvaluator is assumed to be imported in an
# earlier notebook cell -- confirm against the preceding chunks.
evaluator = BinaryClassificationEvaluator()
# fix: evaluate() *returns* the metric; the original discarded it when run as a
# plain script (only a notebook cell would auto-display it).
print(evaluator.evaluate(predictions))

# 12.4 Show the tunable parameters. explainParams() returns a string, so it
# must be printed (the original discarded the return value).
print(gbt.explainParams())

############# GG. Gradient Boosting Cross-validation ##################
# 12.5 Cross validation using parameter grid
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 12.6 Grid of 3 x 2 x 2 = 12 hyper-parameter combinations for the CV search.
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())
# 12.7
# Fit tree-based and Naive Bayes classifiers on the binary-labelled bInput set.
# fix: the original used Python 2 `print expr` statements, which are a
# SyntaxError under Python 3 -- converted to print() function calls.
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier

gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()
print(nb.explainParams())
# Rows with label 0 are excluded before fitting Naive Bayes.
# NOTE(review): presumably to satisfy a constraint of the example -- confirm
# intent against the surrounding text.
trainedModel = nb.fit(bInput.where("label != 0"))

# COMMAND ----------

from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Build an RDD of (prediction, label) float pairs for the RDD-based metrics API.
out = trainedModel.transform(bInput)\
  .select("prediction", "label")\
  .rdd.map(lambda x: (float(x[0]), float(x[1])))
# Fit several MLlib classifiers on bInput, dumping each model's parameter docs.
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Random forest: an ensemble of independently trained decision trees.
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees: trees trained sequentially on residuals.
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import NaiveBayes

# Naive Bayes, fit only on the rows whose label is not 0.
nb = NaiveBayes()
print(nb.explainParams())
trainedModel = nb.fit(bInput.where("label != 0"))

# COMMAND ----------

from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Convert the scored DataFrame into an RDD of (prediction, label) float pairs,
# the input shape the RDD-based BinaryClassificationMetrics API expects.
scored = trainedModel.transform(bInput)
pairs = scored.select("prediction", "label")
out = pairs.rdd.map(lambda row: (float(row[0]), float(row[1])))