예제 #1
0
 def regression_metrics(self, predictions: DataFrame):
     """Evaluate the regression quality of ``predictions``.

     Configures a ``ComputeModelStatistics`` transformer in ``'regression'``
     mode with the label and prediction column names reported by
     ``self.estimator`` and applies it to ``predictions``.

     :param predictions: scored DataFrame to evaluate.
     :return: the metrics table, converted via pandas into a dict in
         ``orient='list'`` form (column name -> list of column values).
     """
     label_column = self.estimator.getLabelCol()
     score_column = self.estimator.getPredictionCol()
     evaluator = ComputeModelStatistics(
         evaluationMetric='regression',
         labelCol=label_column,
         scoresCol=score_column)
     metrics_frame = evaluator.transform(predictions)
     return metrics_frame.toPandas().to_dict(orient='list')
예제 #2
0
# Load a native LightGBM regression model from the file "mymodel".
model = LightGBMRegressionModel.loadNativeModelFromFile("mymodel")

# COMMAND ----------

# MAGIC %md View the feature importances of the trained model.

# COMMAND ----------

print(model.getFeatureImportances())

# COMMAND ----------

# MAGIC %md Score the regressor on the test data.

# COMMAND ----------

# `test` must already be defined by an earlier cell; the scored frame is
# reused by later cells under the name `scoredData`.
scoredData = model.transform(test)
display(scoredData)

# COMMAND ----------

# MAGIC %md Compute metrics using ComputeModelStatistics

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics

# Regression metrics comparing the 'label' column with the 'prediction'
# column of the scored data.
metrics = (
    ComputeModelStatistics(
        evaluationMetric='regression',
        labelCol='label',
        scoresCol='prediction',
    )
    .transform(scoredData)
)
display(metrics)
                        labelCol="income",
                        numFeatures=256).fit(train)

# COMMAND ----------

# MAGIC %md After the model is trained, we score it against the test dataset and view metrics.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel
# Score the `test` dataset with the trained model and print the schema of
# the resulting DataFrame.
prediction = model.transform(test)
prediction.printSchema()

# COMMAND ----------

# Apply a default-configured ComputeModelStatistics to the scored data and
# pull at most 10 rows of the result into pandas.
metrics = ComputeModelStatistics().transform(prediction)
metrics.limit(10).toPandas()

# COMMAND ----------

# MAGIC %md First, we will define the webservice input/output.
# MAGIC For more information, you can visit the [documentation for Spark Serving](https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md)

# COMMAND ----------

from pyspark.sql.types import *
from mmlspark.io import *
import uuid

serving_inputs = spark.readStream.server() \
    .address("localhost", 8898, "my_api") \
예제 #4
0
# MAGIC    trained models by finding the model which performs best on the `test`
# MAGIC    dataset given the specified metric
# MAGIC 
# MAGIC 3. The **`ComputeModelStatistics`** Transformer computes the different
# MAGIC    metrics on a scored dataset (in our case, the `validation` dataset)
# MAGIC    at the same time

# COMMAND ----------

from mmlspark.train import TrainClassifier, ComputeModelStatistics
from mmlspark.automl import FindBestModel

# Split the data into train / test / validation using weights
# 0.60 / 0.20 / 0.20 and a fixed seed
train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)

# Fit one logistic-regression classifier per regularization value
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [
    LogisticRegression(regParam=reg_param) for reg_param in lrHyperParams
]
lrmodels = [
    TrainClassifier(model=estimator, labelCol="label", numFeatures=10000).fit(train)
    for estimator in logisticRegressions
]

# Choose the candidate with the best AUC on the 'test' split
bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)


# Report the chosen model's AUC on the 'validation' split
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
print(
    "Best model's AUC on validation set = "
    + "{0:.2f}%".format(metrics.first()["AUC"] * 100)
)
예제 #5
0
# COMMAND ----------

# MAGIC %md Score the regressor on the test data.

# COMMAND ----------

# Score `testCat` with the trained model and pull at most 10 scored rows
# into pandas for inspection.
scoredData = model.transform(testCat)
scoredData.limit(10).toPandas()

# COMMAND ----------

# MAGIC %md Compute model metrics against the entire scored dataset

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics
# Default-configured ComputeModelStatistics applied to all of `scoredData`.
metrics = ComputeModelStatistics().transform(scoredData)
metrics.toPandas()

# COMMAND ----------

# MAGIC %md Finally, compute and show statistics on individual predictions in the test
# MAGIC dataset, demonstrating the usage of `ComputePerInstanceStatistics`

# COMMAND ----------

from mmlspark.train import ComputePerInstanceStatistics

# Per-row statistics for the scored data; keep four columns and bring at
# most 10 rows into pandas.
evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)
(
    evalPerInstance
    .select("ArrDelay", "Scores", "L1_loss", "L2_loss")
    .limit(10)
    .toPandas()
)
# MAGIC %md After the models have been trained and scored, compute some basic statistics
# MAGIC to evaluate the predictions.  The following statistics are calculated for
# MAGIC regression models to evaluate:
# MAGIC * Mean squared error
# MAGIC * Root mean squared error
# MAGIC * R^2
# MAGIC * Mean absolute error
# MAGIC
# MAGIC Use the `ComputeModelStatistics` API to compute basic statistics for
# MAGIC the Poisson and the Random Forest models.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics
# Metrics for the Poisson model's predictions, using the transformer's
# default configuration.
poissonMetrics = ComputeModelStatistics().transform(poissonPrediction)
print("Poisson Metrics")
poissonMetrics.toPandas()

# COMMAND ----------

# Same computation for the Random Forest model's predictions.
randomForestMetrics = ComputeModelStatistics().transform(
    randomForestPrediction)
print("Random Forest Metrics")
randomForestMetrics.toPandas()

# COMMAND ----------

# MAGIC %md We can also compute per instance statistics for `poissonPrediction`:

# COMMAND ----------
# Drop rows whose label equals 2.0, then binarize the label
# (> 0 becomes 1.0, everything else 0.0) and keep only label and text.
df_test = (
    df_test
    .filter(col("label") != 2.0)
    .withColumn("label", when(col("label") > 0, 1.0).otherwise(0.0))
    .select(["label", "text"])
)
print("Number of test samples after filtering: ", df_test.count())

# COMMAND ----------

# Score the filtered test set with the trained model
predictions = vw_trained.transform(df_test)
predictions.limit(10).toPandas()

# COMMAND ----------

# Classification metrics comparing "label" with the "prediction" column
metrics = ComputeModelStatistics(
    evaluationMetric="classification",
    labelCol="label",
    scoredLabelsCol="prediction",
).transform(predictions)
metrics.toPandas()

# COMMAND ----------

# Utility class for plotting ROC curve (https://stackoverflow.com/questions/52847408/pyspark-extract-roc-curve)
class CurveMetrics(BinaryClassificationMetrics):
    def __init__(self, *args):
        super(CurveMetrics, self).__init__(*args)

    def get_curve(self, method):
        rdd = getattr(self._java_model, method)().toJavaRDD()
        points = []
        for row in rdd.collect():
            points += [(float(row._1()), float(row._2()))]
# COMMAND ----------

# MAGIC %md Featurize images

# COMMAND ----------

# Run the images through the CNTK model and keep only features and labels.
featurizedImages = (
    cntkModel
    .transform(imagesWithLabels)
    .select(["features", "labels"])
)

# COMMAND ----------

# MAGIC %md Use featurized images to train a classifier

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import RandomForestClassifier

# Random split with weights 0.75 / 0.25 (no seed is given).
train, test = featurizedImages.randomSplit([0.75, 0.25])

model = TrainClassifier(
    model=RandomForestClassifier(),
    labelCol="labels",
).fit(train)

# COMMAND ----------

# MAGIC %md Evaluate the accuracy of the model

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics

predictions = model.transform(test)
metrics = ComputeModelStatistics(
    evaluationMetric="accuracy",
).transform(predictions)
metrics.show()
예제 #9
0
import mmlspark
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.train import TrainClassifier, ComputeModelStatistics

# Read the CSV with a header row and inferred column types, then split it
# with weights 0.8 / 0.2 and a fixed seed.
df = spark.read.csv(
    r"C:\Users\yanrujing\Desktop\breast_cancer.csv",
    header=True,
    inferSchema=True,
)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=0)
print(df.limit(10).toPandas())

# First model: logistic regression wrapped in TrainClassifier, trained on
# the raw columns with "class" as the label.
model = TrainClassifier(
    model=LogisticRegression(),
    labelCol="class",
    numFeatures=256,
).fit(train_data)
prediction = model.transform(test_data)
metrics = ComputeModelStatistics().transform(prediction)
print(metrics.limit(10).toPandas())

# Second model: assemble the nine input columns into 'features' and index
# 'class' into 'label' before fitting LightGBM.
f1 = VectorAssembler(
    inputCols=[
        'clump_thickness', 'unif_cell_size', 'unif_cell_shape',
        'marg_adhesion', 'single_epith_cell_size', 'bare_nuclei',
        'bland_chrom', 'norm_nucleoli', 'mitoses',
    ],
    outputCol='features',
)
f2 = StringIndexer(inputCol='class', outputCol='label')

p = Pipeline(stages=[f1, f2]).fit(df)
data = p.transform(df)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=0)
model = LightGBMClassifier(objective='binary').fit(train_data)
test_predict = model.transform(test_data)
# Score `lr_test_data` with `lr_model`; both names come from earlier cells
# that are not part of this excerpt.
lr_predictions = lr_model.transform(lr_test_data)

# Show at most 10 scored rows as a pandas DataFrame.
display(lr_predictions.limit(10).toPandas())

# COMMAND ----------

# MAGIC %md We evaluate the prediction result by using `mmlspark.train.ComputeModelStatistics` which returns four metrics:
# MAGIC * [MSE (Mean Squared Error)](https://en.wikipedia.org/wiki/Mean_squared_error)
# MAGIC * [RMSE (Root Mean Squared Error)](https://en.wikipedia.org/wiki/Root-mean-square_deviation) = sqrt(MSE)
# MAGIC * [R squared](https://en.wikipedia.org/wiki/Coefficient_of_determination)
# MAGIC * [MAE (Mean Absolute Error)](https://en.wikipedia.org/wiki/Mean_absolute_error)

# COMMAND ----------

# Regression metrics comparing the 'target' column with 'prediction'.
metrics = ComputeModelStatistics(
    evaluationMetric='regression',
    labelCol='target',
    scoresCol='prediction',
).transform(lr_predictions)

# Bring the metrics into pandas and prepend a 'model' column that names
# the model these numbers belong to.
results = metrics.toPandas()
results.insert(0, 'model', ['Spark MLlib - Linear Regression'])
display(results)

# COMMAND ----------

# MAGIC %md ## Vowpal Wabbit

# COMMAND ----------

# MAGIC %md Perform VW-style feature hashing. Many types (numbers, string, bool, map of string to (number, string)) are supported.

# COMMAND ----------