# Build, fit, and evaluate the pre-processing + model pipeline, then log the
# resulting metrics and fitted pipeline to MLflow.
# `featureCols`, `cv`, `training`, and `test` are defined outside this excerpt.
stages = [
    # Collect the feature columns into the single vector column "va".
    VectorAssembler(inputCols=featureCols, outputCol="va"),
    # Standardize "va" into the "features" column.
    StandardScaler(inputCol="va", outputCol="features"),
    # Index the "status" column into the "label" column.
    StringIndexer(inputCol="status", outputCol="label"),
    # Final stage; presumably the model estimator (e.g. a cross-validator) --
    # TODO confirm against its definition.
    cv
]
pipeline = Pipeline(stages=stages)
pipelineTrained = pipeline.fit(training)

# Score the held-out data and compare the "prediction" and "label" columns.
predictions = pipelineTrained.transform(test)
metrics = MulticlassMetrics(predictions.select(['prediction', 'label']).rdd)

# All three metrics are reported for label value 1.0 only.
# NOTE(review): which "status" value is indexed as 1.0 is decided by the
# StringIndexer stage -- confirm it is the class of interest.
mlflow.log_metric("precision", metrics.precision(1.0))
mlflow.log_metric("recall", metrics.recall(1.0))
mlflow.log_metric("f1", metrics.fMeasure(1.0))

# Log the fitted pipeline under the "turbine_gbt" artifact path and tag the
# run; the run search below filters on this same tag value.
mlflow.spark.log_model(pipelineTrained, "turbine_gbt")
mlflow.set_tag("model", "turbine_gbt")

# COMMAND ----------

# MAGIC %md ## Save to the model registry
# MAGIC Get the run having the best metrics.f1 among the finished `turbine_gbt` runs

# COMMAND ----------

# Finished runs tagged "turbine_gbt" with a positive f1, best f1 first.
best_models = mlflow.search_runs(
    filter_string='tags.model="turbine_gbt" and attributes.status = "FINISHED" and metrics.f1 > 0',
    order_by=['metrics.f1 DESC'],
    # NOTE(review): the remaining arguments and the closing parenthesis of
    # this call are not visible in this excerpt.
# Pre-processing + model pipeline.
# `cv`, `training`, and `test` come from outside this block.
featureCols = [
    "AN3",
    "AN4",
    "AN5",
    "AN6",
    "AN7",
    "AN8",
    "AN9",
    "AN10",
]
stages = [
    VectorAssembler(inputCols=featureCols, outputCol="va"),
    StandardScaler(inputCol="va", outputCol="features"),
    StringIndexer(inputCol="status", outputCol="label"),
    cv,
]
pipeline = Pipeline(stages=stages)
pipelineTrained = pipeline.fit(training)

# Evaluate the fitted pipeline on the test data.
predictions = pipelineTrained.transform(test)
prediction_label_rdd = predictions.select(['prediction', 'label']).rdd
metrics = MulticlassMetrics(prediction_label_rdd)

# Record the metrics for label 1.0, the fitted pipeline, and a tag on the
# current MLflow run.
for metric_name, compute_metric in (
    ("precision", metrics.precision),
    ("recall", metrics.recall),
    ("f1", metrics.fMeasure),
):
    mlflow.log_metric(metric_name, compute_metric(1.0))
mlflow.spark.log_model(pipelineTrained, "turbine_anomalies")
mlflow.set_tag("model", "gbt")

# Attach a confusion-matrix heatmap to the run.
# stages[2] is the fitted StringIndexer stage; its labels name the axis ticks.
labels = pipelineTrained.stages[2].labels
fig = plt.figure()
confusion_frame = pd.DataFrame(metrics.confusionMatrix().toArray())
sn.heatmap(
    confusion_frame,
    annot=True,
    fmt='g',
    xticklabels=labels,
    yticklabels=labels,
)
figure_title = "Turbine Damage Prediction. F1={:.2f}".format(metrics.fMeasure(1.0))
plt.suptitle(figure_title, fontsize=18)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
# mlflow.log_figure needs mlflow version >=1.13.1
mlflow.log_figure(fig, "confusion_matrix.png")

# COMMAND ----------