예제 #1
0
# COMMAND ----------

summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print dt.explainParams()
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print rfClassifier.explainParams()
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print gbtClassifier.explainParams()
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
print nb.explainParams()
trainedModel = nb.fit(bInput.where("label != 0"))
예제 #2
0
# Do some checking on the new DataFrame, see if they look ok.
df_train.select("V1","V2","Features","Class").show(10)
df_test.select("V1","V2","Features","Class").show(10)

# Get some stats on the datasets
df_train.describe("V1","Class").show()
df_test.describe("V1","Class").show()


# ## Specify Random Forest model

from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="Features", labelCol="Class", numTrees=10)

# Use the `explainParams` method to get a full list of parameters:
print(rf.explainParams())

# ## Fit the Random Forest model

# Use the `fit` method to fit the linear regression model on the train DataFrame:
%time rf_model = rf.fit(df_train)

# The result is an instance of the
# [LogisticRegressionModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.classification.LogisticRegressionModel)
# class:
type(rf_model)


# ## Evaluate model performance on the test dataset.

# Use the `evaluate` method of the
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="vehicle_color", outputCol="vehicle_color_indexed")

# Create dummy variables for `vehicle_color_indexed`:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="vehicle_color_indexed", outputCol="vehicle_color_encoded")

# Select and assemble the features:
from pyspark.ml.feature import VectorAssembler
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Specify the estimator (i.e., classification algorithm):
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating")
print(classifier.explainParams())

# Specify the hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder
maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
subsamplingRateList = [0.5, 1.0]
paramGrid = ParamGridBuilder() \
  .addGrid(classifier.maxDepth, maxDepthList) \
  .addGrid(classifier.numTrees, numTreesList) \
  .addGrid(classifier.subsamplingRate, subsamplingRateList) \
  .build()

# Specify the evaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="star_rating", metricName="accuracy")
# COMMAND ----------

summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
print(nb.explainParams())
trainedModel = nb.fit(bInput.where("label != 0"))