# COMMAND ----------

# Per-iteration objective (loss) values from a previously fitted model summary.
summary.objectiveHistory

# COMMAND ----------

# Decision tree classifier: list all params (with docs/defaults), then fit.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())  # fixed: was a Python 2 print statement
dtModel = dt.fit(bInput)

# COMMAND ----------

# Random forest classifier: same pattern — inspect params, then fit.
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

# Gradient-boosted trees: inspect params, then fit (rebinds `trainedModel`).
from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

# Naive Bayes: fit only on rows where label != 0.
# NOTE(review): the filter drops label-0 rows — confirm this is the intent
# (NaiveBayes itself only requires non-negative labels/features).
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
print(nb.explainParams())
trainedModel = nb.fit(bInput.where("label != 0"))
# Do some checking on the new DataFrame, see if they look ok. df_train.select("V1","V2","Features","Class").show(10) df_test.select("V1","V2","Features","Class").show(10) # Get some stats on the datasets df_train.describe("V1","Class").show() df_test.describe("V1","Class").show() # ## Specify Random Forest model from pyspark.ml.classification import RandomForestClassifier rf = RandomForestClassifier(featuresCol="Features", labelCol="Class", numTrees=10) # Use the `explainParams` method to get a full list of parameters: print(rf.explainParams()) # ## Fit the Random Forest model # Use the `fit` method to fit the linear regression model on the train DataFrame: %time rf_model = rf.fit(df_train) # The result is an instance of the # [LogisticRegressionModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel) # class: type(rf_model) # ## Evaluate model performance on the test dataset. # Use the `evaluate` method of the
# Feature-engineering stages plus the tuning setup for a random forest
# model that predicts `star_rating`.

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Map the string column `vehicle_color` to numeric category indices.
indexer = StringIndexer(inputCol="vehicle_color", outputCol="vehicle_color_indexed")

# One-hot encode the indexed color into dummy-variable form.
encoder = OneHotEncoder(inputCol="vehicle_color_indexed", outputCol="vehicle_color_encoded")

# Assemble the selected columns into a single feature vector.
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# The estimator (classification algorithm) to be tuned.
classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating")
print(classifier.explainParams())

# Candidate hyperparameter values.
maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
subsamplingRateList = [0.5, 1.0]

# Cross the candidate lists into the full search grid.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.maxDepth, maxDepthList)
    .addGrid(classifier.numTrees, numTreesList)
    .addGrid(classifier.subsamplingRate, subsamplingRateList)
    .build()
)

# Score candidate models by plain accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="star_rating", metricName="accuracy")
# COMMAND ---------- summary.objectiveHistory # COMMAND ---------- from pyspark.ml.classification import DecisionTreeClassifier dt = DecisionTreeClassifier() print(dt.explainParams()) dtModel = dt.fit(bInput) # COMMAND ---------- from pyspark.ml.classification import RandomForestClassifier rfClassifier = RandomForestClassifier() print(rfClassifier.explainParams()) trainedModel = rfClassifier.fit(bInput) # COMMAND ---------- from pyspark.ml.classification import GBTClassifier gbtClassifier = GBTClassifier() print(gbtClassifier.explainParams()) trainedModel = gbtClassifier.fit(bInput) # COMMAND ---------- from pyspark.ml.classification import NaiveBayes nb = NaiveBayes() print(nb.explainParams()) trainedModel = nb.fit(bInput.where("label != 0"))