    # Assemble the sensor features, scale them, index the status label, and
    # append the cross-validated GBT classifier (cv) defined in an earlier cell
    stages = [
        VectorAssembler(inputCols=featureCols, outputCol="va"),
        StandardScaler(inputCol="va", outputCol="features"),
        StringIndexer(inputCol="status", outputCol="label"),
        cv,
    ]
    pipeline = Pipeline(stages=stages)

    pipelineTrained = pipeline.fit(training)

    predictions = pipelineTrained.transform(test)
    # Evaluate on the held-out test set with MLlib's MulticlassMetrics
    metrics = MulticlassMetrics(
        predictions.select(['prediction', 'label']).rdd)

    mlflow.log_metric("precision", metrics.precision(1.0))
    mlflow.log_metric("recall", metrics.recall(1.0))
    mlflow.log_metric("f1", metrics.fMeasure(1.0))

    mlflow.spark.log_model(pipelineTrained, "turbine_gbt")
    mlflow.set_tag("model", "turbine_gbt")

# COMMAND ----------

# MAGIC %md ## Save to the model registry
# MAGIC Find the finished run with the best F1 score and register its model

# COMMAND ----------

best_models = mlflow.search_runs(
    filter_string='tags.model="turbine_gbt" and attributes.status = "FINISHED" and metrics.f1 > 0',
    order_by=['metrics.f1 DESC'],
    max_results=1)  # keep only the top-scoring run
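
# A minimal sketch of the registration step the heading promises, assuming the
# model was logged under the artifact path "turbine_gbt" as in the log_model
# call above; the registry model name "turbine_gbt" below is illustrative.
if len(best_models) > 0:
    model_uri = f"runs:/{best_models.iloc[0]['run_id']}/turbine_gbt"
    mlflow.register_model(model_uri, "turbine_gbt")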
# COMMAND ----------

# MAGIC %md ## Example 2
# MAGIC The same training run, extended to log a confusion matrix figure with the run.
  # Define pre-processing pipeline
  featureCols = ["AN3", "AN4", "AN5", "AN6", "AN7", "AN8", "AN9", "AN10"]
  stages = [VectorAssembler(inputCols=featureCols, outputCol="va"),
            StandardScaler(inputCol="va", outputCol="features"),
            StringIndexer(inputCol="status", outputCol="label"),
            cv]  # cross-validated GBT classifier defined in an earlier cell
  pipeline = Pipeline(stages=stages)

  pipelineTrained = pipeline.fit(training)
  
  predictions = pipelineTrained.transform(test)
  metrics = MulticlassMetrics(predictions.select(['prediction', 'label']).rdd)
  
  # Log evaluation metrics for the positive class (label 1.0) with the run
  mlflow.log_metric("precision", metrics.precision(1.0))
  mlflow.log_metric("recall", metrics.recall(1.0))
  mlflow.log_metric("f1", metrics.fMeasure(1.0))
  
  mlflow.spark.log_model(pipelineTrained, "turbine_anomalies")
  mlflow.set_tag("model", "gbt") 
  
  # Log a confusion matrix figure as a run artifact
  labels = pipelineTrained.stages[2].labels  # class labels from the StringIndexerModel
  fig = plt.figure()
  sn.heatmap(pd.DataFrame(metrics.confusionMatrix().toArray()),
             annot=True, fmt='g', xticklabels=labels, yticklabels=labels)
  plt.suptitle("Turbine Damage Prediction. F1={:.2f}".format(metrics.fMeasure(1.0)), fontsize=18)
  plt.xlabel("Predicted Labels")
  plt.ylabel("True Labels")
  mlflow.log_figure(fig, "confusion_matrix.png") # needs mlflow version >=1.13.1
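
# COMMAND ----------

# Usage sketch (assumptions flagged): reload the Spark pipeline logged above and
# score the held-out set. `run_id` is a placeholder for a real run ID, e.g. taken
# from the MLflow UI or from mlflow.search_runs.
run_id = "<run_id>"  # hypothetical value, not from the original notebook
loaded = mlflow.spark.load_model(f"runs:/{run_id}/turbine_anomalies")
display(loaded.transform(test).select("prediction", "label"))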

# COMMAND ----------