def register_best_model(**kwargs):
    """
    Take the best performing model, register it under the BestModel name,
    and ship it to prod.
    """
    run_id = kwargs["ti"].xcom_pull(task_ids="get_best_model", key="best_model_run_id")
    model_uri = f"runs:/{run_id}/model"
    model_details = mlflow.register_model(model_uri, "BestModel")
    # Note: this doesn't put it in prod; it only updates the registered model in the model repo.

    # This is what would make it prod, but this probably shouldn't be automated
    # without some testing and eyes on it.
    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_details.name,
        version=model_details.version,
        stage='Production',
    )
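
# A minimal sketch of how register_best_model could be wired into a DAG,
# assuming Airflow 2.x (where the task context, including `ti`, is passed to
# the callable automatically). The DAG id, schedule, and the upstream
# "get_best_model" task id are assumptions, not part of the original pipeline.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(
    dag_id="model_promotion",  # hypothetical DAG id
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    # The assumed upstream task pushes "best_model_run_id" to XCom;
    # register_best_model pulls it, registers the run's model, and promotes it.
    register = PythonOperator(
        task_id="register_best_model",
        python_callable=register_best_model,
    )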
time.sleep(5)

# COMMAND ----------

# MAGIC %md ### Perform a model stage transition
# MAGIC
# MAGIC The MLflow Model Registry defines several model stages: **None**, **Staging**, **Production**, and **Archived**. Each stage has a unique meaning. For example, **Staging** is meant for model testing, while **Production** is for models that have completed the testing or review processes and have been deployed to applications.

# COMMAND ----------

from mlflow.tracking.client import MlflowClient

client = MlflowClient()
client.transition_model_version_stage(
    name=modelDetails.name,
    version=modelDetails.version,
    stage='Production',
)

# COMMAND ----------

# MAGIC %md The MLflow Model Registry allows multiple model versions to share the same stage. When referencing a model by stage, the Model Registry will use the latest model version (the model version with the largest version ID). The `MlflowClient.get_latest_versions()` function fetches the latest model version for a given stage or set of stages. The following cell uses this function to print the latest version of the power forecasting model that is in the `Production` stage.

# COMMAND ----------

latestVersionInfo = client.get_latest_versions(modelName, stages=["Production"])
latestVersion = latestVersionInfo[0].version
print("The latest production version of the model '%s' is '%s'." % (modelName, latestVersion))
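
# COMMAND ----------

# MAGIC %md A registered model can also be loaded by stage rather than by an explicit version number: a `models:/<name>/<stage>` URI resolves to the latest version in that stage. Below is a minimal sketch using the generic `mlflow.pyfunc` flavor; the input dataframe `model_input` is a hypothetical placeholder.

# COMMAND ----------

import mlflow.pyfunc

modelProductionUri = "models:/{model_name}/Production".format(model_name=modelName)
modelProduction = mlflow.pyfunc.load_model(modelProductionUri)
# predictions = modelProduction.predict(model_input)  # model_input is hypothetical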
model_name = "linear-regression-model" artifact_path = "best_model" model_uri = "runs:/{run_id}/{artifact_path}".format(run_id=run_id, artifact_path=artifact_path) registered_model = mlflow.register_model(model_uri=model_uri, name=model_name, await_registration_for=120) #Add model and model version descriptions to Model Registry client.update_model_version( name=registered_model.name, version=registered_model.version, description="This predicts the age of a customer using transaction history." ) #Transition a model version to Staging/Prod/Archived client.transition_model_version_stage( name=registered_model.name, version=registered_model.version, stage='Staging', ) #Get model version details model_version = client.get_model_version( name=registered_model.name, version=registered_model.version, ) #Load a specific model version from the Model Registry model_uri = "models:/{model_name}/staging".format(model_name=model_name) spark_model = mlflow.spark.load_model(model_uri) # COMMAND ----------
f"Bad current stage '{current_stage}' for model version {model_version}. Should be None or Staging." ) # COMMAND ---------- # MAGIC %md ## Make predictions on test data # COMMAND ---------- data = spark.read.format("delta").load(input_data_path) preds = data.withColumn( "prediction", model_udf(*data.drop("quality").columns)).select( "quality", "prediction").toPandas() # it's okay since dataframe is small mse = mean_squared_error(preds["quality"], preds["prediction"]) print(f"MSE: {mse}") # COMMAND ---------- # MAGIC %md ## Test succeeded: transition the model to Staging # COMMAND ---------- client.transition_model_version_stage( name=model_name, version=model_version, stage="Staging", ) # COMMAND ----------
client.update_registered_model(
    name=model_name,
    description="This model forecasts wine quality based on the wine's characteristics."
)

client.update_model_version(
    name=model_name,
    version=model_version,
    description="This model version was built using sklearn."
)

# COMMAND ----------

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=stage,
    archive_existing_versions=True
)

model_version_details = client.get_model_version(
    name=model_name,
    version=model_version,
)
print("The current model stage is: '{stage}'".format(stage=model_version_details.current_stage))

latest_version_info = client.get_latest_versions(model_name, stages=[stage])
latest_production_version = latest_version_info[0].version
print("The latest production version of the model '%s' is '%s'." % (model_name, latest_production_version))

# COMMAND ----------
# MAGIC
# MAGIC If you have permission to transition a model to a particular stage, you can make the transition directly by using the `MlflowClient.transition_model_version_stage()` function. If you do not have permission, you can request a stage transition using the REST API; for example:
# MAGIC
# MAGIC ```
# MAGIC %sh curl -i -X POST -H "X-Databricks-Org-Id: <YOUR_ORG_ID>" -H "Authorization: Bearer <YOUR_ACCESS_TOKEN>" https://<YOUR_DATABRICKS_WORKSPACE_URL>/api/2.0/preview/mlflow/transition-requests/create -d '{"comment": "Please move this model into production!", "model_version": {"version": 1, "registered_model": {"name": "power-forecasting-model"}}, "stage": "Production"}'
# MAGIC ```

# COMMAND ----------

# MAGIC %md Now that you've learned about stage transitions, transition the model to the `Production` stage.

# COMMAND ----------

client.transition_model_version_stage(
    name=model_details.name,
    version=model_details.version,
    stage='Production',
)

# COMMAND ----------

# MAGIC %md Use the `MlflowClient.get_model_version()` function to fetch the model's current stage.

# COMMAND ----------

model_version_details = client.get_model_version(
    name=model_details.name,
    version=model_details.version,
)
print("The current model stage is: '{stage}'".format(stage=model_version_details.current_stage))
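
# COMMAND ----------

# MAGIC %md The stage-transition request shown above with `curl` can also be issued from Python. Here is a minimal sketch using `requests` against the same preview endpoint; the workspace URL and access token are placeholders to fill in, and the payload mirrors the `curl` example.

# COMMAND ----------

import requests

response = requests.post(
    "https://<YOUR_DATABRICKS_WORKSPACE_URL>/api/2.0/preview/mlflow/transition-requests/create",
    headers={"Authorization": "Bearer <YOUR_ACCESS_TOKEN>"},
    json={
        "comment": "Please move this model into production!",
        "model_version": {"version": 1, "registered_model": {"name": "power-forecasting-model"}},
        "stage": "Production",
    },
)
print(response.status_code, response.text)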
def train(data_conf, model_conf, **kwargs):

    try:
        print("-----------------------------------")
        print("Starting Cashflow DL Model Training")
        print("-----------------------------------")
        print()

        # ==============================
        # 0. Main parameters definitions
        # ==============================

        # Size of X and y arrays definition
        N_days_X, N_days_y = int(data_conf['number_of_historical_days']), int(
            data_conf['number_of_predicted_days'])  # 365, 92
        print('Number of days used for prediction (X): ', N_days_X)
        print('Number of days predicted (y): ', N_days_y)
        print()

        # Date range definition
        start_date, end_date = data_conf['start_date'], data_conf['end_date']
        start_date_dt, end_date_dt, start_date_prediction, end_date_prediction, end_date_plusOneDay, end_date_minus_6month = dates_definitions(
            start_date, end_date, N_days_X, N_days_y)
        print('Date range: ', start_date, end_date)
        print()

        model_name = model_conf['model_name']

    except Exception as e:
        print("Errored on initialization")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.1 Pre-processing before model training
        # ========================================

        # Loading dataset
        table_in = data_conf[environment]['table_to_train_on']
        #ts_balance = spark.read.parquet("/mnt/test/{0}.parquet".format(table_in)).cache()
        ts_balance = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_in))

        # Cleaning of the time series
        ts_balance = ts_balance.withColumn(
            'balance', ts_balance.balance.cast("array<float>"))

        ts_balance = ts_balance.withColumn(
            'keep_ts',
            F.udf(lambda x, y: time_series_cleaning(x, y), "int")('balance', F.lit(20))
        )  # keep only time series with at least 20 transactions, to be used in the training

        ts_balance = ts_balance.where('keep_ts == 1')

        # Creating the dataset on which we train (and test and validate) the model
        ts_balance_model = ts_balance.sample(
            False, 0.7, seed=0)  # now 0.7, but in a real case it would be 0.1 at best, or 0.05
        print('ts_balance_model.count()', ts_balance_model.count())

        # Pre-processing before model training
        ts_balance_model = pre_processing(ts_balance_model,
                                          end_date,
                                          spark,
                                          serving=False)
        ts_balance_model.show(3)

        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())
        ts_balance_model.show(3)

        # Saving prepared dataset
        table_out = 'cashflow_training_step1'
        #ts_balance_model.write.format("parquet").mode("overwrite").save("/mnt/test/{0}.parquet".format(table_out))
        ts_balance_model.write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_out))

    except Exception as e:
        print("Errored on step T.1: pre-processing before model training")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ========================================
        # T.2 Generating TRAIN, VAL, TEST datasets
        # ========================================

        # Loading dataset
        table_model = 'cashflow_training_step1'
        #ts_balance_model = spark.read.parquet("/mnt/test/{0}.parquet".format(table_model)).cache()
        ts_balance_model = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_model)).cache()

        ts_balance_model.show(3)
        print('ts_balance_model.count()', ts_balance_model.count())
        print('ts_balance_model.rdd.getNumPartitions()',
              ts_balance_model.rdd.getNumPartitions())

        train_set, val_set, test_set = ts_balance_model.randomSplit(
            [0.6, 0.2, 0.2], seed=12345)
        train_set.show(3)
        print(
            'train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(), test_set.rdd.getNumPartitions()',
            train_set.rdd.getNumPartitions(), val_set.rdd.getNumPartitions(),
            test_set.rdd.getNumPartitions())

        # Saving prepared datasets (train, val, test sets) to Delta
        table_train = 'cashflow_train'
        table_val = 'cashflow_val'
        table_test = data_conf[environment]['table_test_for_performance']  # 'cashflow_test'

        train_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_train))
        val_set.select('X', 'y').write.format("delta").mode("overwrite").save(
            "/mnt/delta/{0}".format(table_val))
        test_set.select('primaryaccountholder', 'transactiondate', 'balance')\
            .write.format("delta").mode("overwrite").save("/mnt/delta/{0}".format(table_test))

    except Exception as e:
        print("Errored on step T.2: generating train, val, test datasets")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e

    try:
        # ==============================
        # T.3 MODEL DEFINITION AND TRAIN
        # ==============================

        table_train = 'cashflow_train'
        table_val = 'cashflow_val'

        #table_train = spark.read.parquet("/mnt/test/{0}.parquet".format(table_train))
        table_train = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_train))
        #table_val = spark.read.parquet("/mnt/test/{0}.parquet".format(table_val))
        table_val = spark.read.format("delta").load(
            "/mnt/delta/{0}".format(table_val))

        table_train_count = table_train.count()
        table_val_count = table_val.count()
        #table_train_count, table_val_count

        from pyspark.sql.functions import col
        from petastorm.spark import SparkDatasetConverter, make_spark_converter

        # Set a cache directory on DBFS FUSE for intermediate data.
        spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
                       "file:///dbfs/tmp/petastorm/cache")

        converter_train = make_spark_converter(table_train)
        converter_val = make_spark_converter(table_val)
        print(f"train: {len(converter_train)}, val: {len(converter_val)}")

        def get_compiled_model(N_days_X, N_days_y, model_conf):  #lr=0.001
            #model = get_model(lr=lr)
            model = define_1dcnn_model(N_days_X, N_days_y, model_conf)

            hyperparameters = model_conf['hyperParameters']
            opt = tf.keras.optimizers.Adam()

            # Model compilation
            model.compile(optimizer=opt, loss=hyperparameters['loss'])

            return model

        # Enable auto-logging to MLflow to capture TensorBoard metrics.
        mlflow.tensorflow.autolog(every_n_iter=1)

        model_name = model_conf['model_name']
        mlflow_model_name = model_name
        model_dir = "/tmp/" + model_name
        try:
            dbutils.fs.rm(model_dir, recurse=True)
        except OSError:
            pass

        with mlflow.start_run():

            NUM_EPOCHS = model_conf['hyperParameters']['epochs']  #5
            BATCH_SIZE = model_conf['hyperParameters']['batch_size']  #500

            def train_and_evaluate(N_days_X, N_days_y, model_conf):  #lr=0.001
                model = get_compiled_model(N_days_X, N_days_y, model_conf)  #lr

                with converter_train.make_tf_dataset(batch_size=BATCH_SIZE) as train_dataset, \
                     converter_val.make_tf_dataset(batch_size=BATCH_SIZE) as val_dataset:

                    #train_dataset = train_dataset.map(lambda x: (x.features, x.label_index))
                    train_dataset = train_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    steps_per_epoch = len(converter_train) // BATCH_SIZE

                    #val_dataset = val_dataset.map(lambda x: (x.features, x.label_index))
                    val_dataset = val_dataset.map(
                        lambda x: (tf.reshape(x.X, [-1, N_days_X, 1]),
                                   tf.reshape(x.y, [-1, N_days_y])))
                    validation_steps = max(1, len(converter_val) // BATCH_SIZE)

                    print(f"steps_per_epoch: {steps_per_epoch}, validation_steps: {validation_steps}")

                    hist = model.fit(train_dataset,
                                     steps_per_epoch=steps_per_epoch,
                                     epochs=NUM_EPOCHS,
                                     validation_data=val_dataset,
                                     validation_steps=validation_steps,
                                     verbose=2)
                    return model, hist

            model, hist = train_and_evaluate(N_days_X, N_days_y, model_conf)
            print(hist.history['val_loss'][-1])

            # MLflow logging
            #mlflow.log_artifact(cwd + "data.json")
            #mlflow.log_artifact(cwd + "config.json")
            mlflow.log_param("model_name", str(model_name))
            mlflow.log_param("N_days_X", N_days_X)
            mlflow.log_param("N_days_y", N_days_y)
            mlflow.log_param("start_date", start_date)
            mlflow.log_param("end_date", end_date)
            mlflow.log_param("num_epochs", str(NUM_EPOCHS))
            mlflow.log_param("batch_size", str(BATCH_SIZE))
            #mlflow.log_param("steps_per_epoch", str(steps_per_epoch))  #validation_steps

            # Saving using tf.keras.models.save_model (SavedModel format)
            tf.keras.models.save_model(model, filepath=model_dir + '/model')
            #model.save(filepath=model_dir + 'model', save_format="h5")  # H5 format (TODO: look at how to register that)

            # Saving using mlflow.tensorflow.save_model (this does NOT log nor register the model), and it does not overwrite:
            #mlflow.tensorflow.save_model(tf_saved_model_dir=model_dir + '/model',
            #                             tf_meta_graph_tags=[tf.compat.v1.saved_model.tag_constants.SERVING],
            #                             tf_signature_def_key='serving_default',
            #                             path='model')

            # Logging the already saved model
            mlflow.tensorflow.log_model(
                tf_saved_model_dir=model_dir + '/model',
                tf_meta_graph_tags=[
                    tf.compat.v1.saved_model.tag_constants.SERVING
                ],
                tf_signature_def_key='serving_default',
                registered_model_name=model_name,
                artifact_path='model')

            # Getting the version number of the newly registered MLflow model (useful for next steps)
            mlflow_model_version = 0
            client_current_model = MlflowClient()
            for mv in client_current_model.search_model_versions(
                    "name='{0}'".format(mlflow_model_name)):
                #if int(dict(mv)['version']) == mlflow_model_version:
                if int(dict(mv)['version']) >= mlflow_model_version:  # finding the last version registered
                    mlflow_model_version = int(dict(mv)['version'])
                    model_dict = dict(mv)

            # Update 2020-07-17: to grab the latest model version, we can also do it like this (TO BE TESTED!):
            #model_version_infos = client_current_model.search_model_versions(f"name = '{model_name}'")
            #mlflow_model_version = max([model_version_info.version for model_version_info in model_version_infos])

            # Wait until the model is ready
            def wait_until_model_ready(model_name, model_version):
                client = MlflowClient()
                for _ in range(20):
                    model_version_details = client.get_model_version(
                        name=model_name,
                        version=model_version,
                    )
                    status = ModelVersionStatus.from_string(
                        model_version_details.status)
                    print("Model status: %s" % ModelVersionStatus.to_string(status))
                    if status == ModelVersionStatus.READY:
                        break
                    tm.sleep(5)

            wait_until_model_ready(mlflow_model_name, mlflow_model_version)

            # Transition the registered model stage from "None" to "Staging"
            client_current_model.transition_model_version_stage(
                name=mlflow_model_name,
                version=mlflow_model_version,
                stage="Staging",
            )

            # Copy the files from the driver node to DBFS (so that they can be accessed, e.g., after the current cluster terminates)
            dbutils.fs.cp("file:/tmp/{0}/model".format(model_name),
                          "dbfs:/mnt/test/{0}/model".format(model_name),
                          recurse=True)
            print('Model copied here: ',
                  "dbfs:/mnt/test/{0}/model/".format(model_name))

        #mlflow.end_run()

    except Exception as e:
        print("Errored on step T.3: model definition and train")
        print("Exception Trace: {0}".format(e))
        print(traceback.format_exc())
        raise e
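
# A hedged sketch of the configuration shape train() expects, inferred from the
# keys accessed above. The concrete values below are illustrative placeholders,
# not the project's real configuration; epochs=5 and batch_size=500 echo the
# inline comments in step T.3.
environment = "dev"  # train() reads this module-level variable

data_conf = {
    "number_of_historical_days": "365",
    "number_of_predicted_days": "92",
    "start_date": "2018-01-01",  # hypothetical date range
    "end_date": "2019-12-31",
    "dev": {
        "table_to_train_on": "cashflow_ts_balance",      # hypothetical table name
        "table_test_for_performance": "cashflow_test",
    },
}

model_conf = {
    "model_name": "cashflow-1dcnn",                       # hypothetical model name
    "hyperParameters": {"loss": "mse", "epochs": 5, "batch_size": 500},
}

train(data_conf, model_conf)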
mlflow_model_stage = 'Staging'

client = MlflowClient()
for mv in client.search_model_versions("name='{0}'".format(mlflow_model_name)):
    if dict(mv)['current_stage'] == mlflow_model_stage:
        model_dict = dict(mv)

        print('Model extracted run_id: ', model_dict['run_id'])
        print('Model extracted version number: ', model_dict['version'])
        print('Model extracted stage: ', model_dict['current_stage'])

        # Transition the registered model stage from "Staging" to "Production"
        client.transition_model_version_stage(
            name=mlflow_model_name,
            version=model_dict['version'],
            stage="Production",
        )
        print()
        print('Model transitioned to Production')
        break

# COMMAND ----------

# MAGIC %run ./utils

# COMMAND ----------

# -*- coding: utf-8 -*-
# MAGIC %md
# MAGIC ### Delete Registered Models

# COMMAND ----------

from mlflow.tracking.client import MlflowClient

client = MlflowClient()
modelName = "Titanic-Model__" + userName
models = client.search_model_versions("name='{}'".format(modelName))

# Loop over the registered model's versions
for i in range(len(models)):
    try:
        # Set the model version's stage to Archived (a version in an active stage cannot be deleted)
        client.transition_model_version_stage(name=modelName,
                                              version=models[i].version,
                                              stage='Archived')
    except Exception:
        pass
    # Delete the model version
    client.delete_model_version(modelName, models[i].version)

# Delete the registered model
client.delete_registered_model(modelName)

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC © 2020 Databricks, Inc. All rights reserved.<br/>
# MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="http://www.apache.org/">Apache Software Foundation</a>.<br/>
# MAGIC <br/>
# MAGIC <a href="https://databricks.com/privacy-policy">Privacy Policy</a> | <a href="https://databricks.com/terms-of-use">Terms of Use</a> | <a href="http://help.databricks.com/">Support</a>