field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & (field != "price")) ] assemblerInputs = indexOutputCols + numericCols vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5, numTrees=100, seed=42) pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf]) with mlflow.start_run(run_name="random-forest") as run: # Log params: num_trees and max_depth mlflow.log_param("num_trees", rf.getNumTrees()) mlflow.log_param("max_depth", rf.getMaxDepth()) # Log model pipelineModel = pipeline.fit(trainDF) mlflow.spark.log_model(pipelineModel, "model") # Log metrics: RMSE and R2 predDF = pipelineModel.transform(testDF) regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price") rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF) r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF) mlflow.log_metrics({"rmse": rmse, "r2": r2}) # Log artifact: feature importance scores rfModel = pipelineModel.stages[-1] pandasDF = (pd.DataFrame( list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)), columns=["feature", "importance"]).sort_values(by="importance",
# The tracking server could also be selected through the environment instead:
#   os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000/'
mlflow.set_tracking_uri('http://localhost:5000/')
current_tracking_uri = mlflow.tracking.get_tracking_uri()
print("mlflow tracking_uri: " + current_tracking_uri)

# Disabled one-time setup: create the experiment with an HDFS artifact
# location before selecting it, reporting (not raising) any failure, e.g.
#   mlflow.create_experiment(
#       name="Advertising with Spark3",
#       artifact_location="hdfs://localhost:9000/user/train/mlflow")
mlflow.set_experiment("Advertising with Spark3")

with mlflow.start_run(run_name="spark-advertising-random-forest") as run:
    # Record the estimator settings, one MLflow param per getter.
    for param_name, read_param in (
        ("num_trees", estimator.getNumTrees),
        ("max_depth", estimator.getMaxDepth),
        ("max_bins", estimator.getMaxBins),
    ):
        mlflow.log_param(param_name, read_param())

    # Fit the pipeline on the training data and store the fitted model
    # under the "model" artifact path of this run.
    pipelineModel = pipeline_obj.fit(train_df)
    mlflow.spark.log_model(pipelineModel, "model")

    # Score the test data and evaluate against the "Sales" label.  One
    # evaluator is reused and switched from RMSE to R2 between evaluations.
    predDF = pipelineModel.transform(test_df)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="Sales")
    regressionEvaluator.setMetricName("rmse")
    rmse = regressionEvaluator.evaluate(predDF)
    regressionEvaluator.setMetricName("r2")
    r2 = regressionEvaluator.evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log the sklearn model and register as version 1
def train_model(df_orig, maxDepth, numTrees):
    """Train, log, register and stage a random-forest arrival-delay model.

    Rows of ``df_orig`` whose ``DEST`` equals ``airport_code`` and whose
    ``SCHEDULED_DEP_TIME`` lies between ``training_start_date`` and
    ``training_end_date`` are split 80/20 (seed 42) into train and test sets.
    A pipeline made of the two stages returned by ``load_assemblers(df_orig)``
    followed by a ``RandomForestRegressor`` on ``ARR_DELAY`` is fitted inside
    an MLflow run that records the tree parameters, the fitted pipeline, the
    training-window tags and the RMSE / R2 measured on the test split.  The
    logged model is then registered, moved to a registry stage, and used to
    score the rows whose scheduled departure date equals ``inference_date``.

    Besides its arguments, the function reads these names from the enclosing
    scope: ``airport_code``, ``training_start_date``, ``training_end_date``,
    ``inference_date`` and ``load_assemblers``.

    Args:
        df_orig: Spark DataFrame to train on; the code reads its ``DEST``,
            ``SCHEDULED_DEP_TIME`` and ``ARR_DELAY`` columns.
        maxDepth: Passed to ``RandomForestRegressor(maxDepth=...)``.
        numTrees: Passed to ``RandomForestRegressor(numTrees=...)``.

    Returns:
        The DataFrame obtained by applying the fitted pipeline to the
        ``inference_date`` rows.
    """
    import mlflow
    import mlflow.spark
    import pandas as pd
    from mlflow.tracking import MlflowClient
    from pyspark.ml import Pipeline
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.sql.functions import col, to_date

    label_col = "ARR_DELAY"
    artifact_path = "{0}_rfr".format(airport_code)

    at_destination = df_orig.filter(df_orig.DEST == airport_code)

    # Training/test data: the destination airport within the training window.
    # This is the data the reported performance is measured on.
    window_start = pd.to_datetime(training_start_date)
    window_end = pd.to_datetime(training_end_date)
    df = at_destination.filter(
        col("SCHEDULED_DEP_TIME").between(window_start, window_end))

    # Inference data: the destination airport on the inference date only.
    df_inference = at_destination.filter(
        to_date(col("SCHEDULED_DEP_TIME")) == inference_date)

    trainDF, testDF = df.randomSplit([0.8, 0.2], seed=42)
    stringIndexer, vecAssembler = load_assemblers(df_orig)

    run_name = "flights-randomforest-with-regressors-{0}".format(airport_code)
    with mlflow.start_run(run_name=run_name) as run:
        rf = RandomForestRegressor(featuresCol="features",
                                   labelCol=label_col,
                                   maxDepth=maxDepth,
                                   numTrees=numTrees)
        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
        mlflow.log_param("num_trees", rf.getNumTrees())
        mlflow.log_param("max_depth", rf.getMaxDepth())

        # Fitting is where the indexer/assembler stages are applied to the
        # training data before the forest is trained.
        pipelineModel = pipeline.fit(trainDF)
        mlflow.spark.log_model(pipelineModel, artifact_path)
        mlflow.set_tags({
            "training_start_date": training_start_date,
            "training_end_date": training_end_date,
        })

        # Metrics on the held-out split; the evaluator is reused and only
        # its metric name is switched between the two evaluations.
        predDF = pipelineModel.transform(testDF)
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol=label_col)
        rmse = evaluator.setMetricName("rmse").evaluate(predDF)
        r2 = evaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Use the id of the run opened above.  Searching the experiment for
        # the most recently started run can pick up a different run when
        # several trainings log to the same experiment at once.
        model_name = "rfr_{0}_{1}_{2}_{3}".format(
            airport_code, training_start_date, training_end_date,
            inference_date)
        model_uri = "runs:/{run_id}/{path}".format(run_id=run.info.run_id,
                                                   path=artifact_path)
        model_details = mlflow.register_model(model_uri=model_uri,
                                              name=model_name)

        # Promote exactly the version that was just registered instead of
        # whichever version a registry search happens to list first.
        _promote_model_version(MlflowClient(), model_name,
                               model_details.version)

    return pipelineModel.transform(df_inference)


def _promote_model_version(client, model_name, new_version):
    """Move ``new_version`` of ``model_name`` to its registry stage.

    Every version currently in ``Staging`` is archived.  The new version goes
    to ``Staging`` when some version was already in ``Staging`` or
    ``Production``; otherwise it goes to ``Production``.
    """
    target_stage = "Production"
    for mv in client.search_model_versions(f"name='{model_name}'"):
        if mv.current_stage == "Staging":
            # Archive the currently staged model.
            client.transition_model_version_stage(name=mv.name,
                                                  version=mv.version,
                                                  stage="Archived")
            target_stage = "Staging"
        elif mv.current_stage == "Production":
            target_stage = "Staging"

    client.transition_model_version_stage(name=model_name,
                                          version=new_version,
                                          stage=target_stage)
def train_model(df_orig, maxDepth, numTrees, stringIndexer, vecAssembler, train_on):
    """Train, log and register a random-forest delay model for one airport.

    With ``train_on == "ARR"`` the rows whose ``DEST`` equals ``airport_code``
    and whose ``SCHEDULED_ARR_TIME`` lies in the training window are used and
    the label is ``ARR_DELAY``.  With ``train_on == "DEP"`` the rows whose
    ``ORIGIN`` equals ``airport_code`` and whose ``SCHEDULED_DEP_TIME`` lies in
    the window are used, a fixed list of columns is dropped, and the label is
    ``DEP_DELAY``.  The rows are split 80/20 (seed 42); a pipeline of
    ``stringIndexer``, ``vecAssembler`` and a ``RandomForestRegressor`` is
    fitted inside an MLflow run that records the tree parameters, the fitted
    pipeline, the training-window tags and the RMSE / R2 on the test split.
    The logged model is then registered in the model registry.

    Besides its arguments, the function reads these names from the enclosing
    scope: ``airport_code``, ``training_start_date`` and
    ``training_end_date``.

    Args:
        df_orig: Spark DataFrame to train on.
        maxDepth: Passed to ``RandomForestRegressor(maxDepth=...)``.
        numTrees: Passed to ``RandomForestRegressor(numTrees=...)``.
        stringIndexer: First pipeline stage.
        vecAssembler: Second pipeline stage; the regressor reads its input
            from the ``features`` column.
        train_on: ``"ARR"`` or ``"DEP"``.

    Raises:
        ValueError: If ``train_on`` is neither ``"ARR"`` nor ``"DEP"``.
    """
    # NOTE(review): to_date, uuid, StringIndexer, VectorAssembler and lit are
    # not used by the code reviewed here; they are left in place until it is
    # confirmed that nothing else relies on them.
    from pyspark.sql.functions import col, to_date
    import mlflow
    import mlflow.spark
    import pandas as pd
    import uuid
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.sql.functions import lit
    from mlflow.tracking import MlflowClient

    # Resolve the label once; an unsupported mode used to surface only later
    # as an AttributeError on None.
    label_cols = {"ARR": "ARR_DELAY", "DEP": "DEP_DELAY"}
    if train_on not in label_cols:
        raise ValueError(
            "train_on must be 'ARR' or 'DEP', got {0!r}".format(train_on))
    label_col = label_cols[train_on]

    # Training/test data: the airport within the training window.  This is
    # the data the reported performance is measured on.
    window_start = pd.to_datetime(training_start_date)
    window_end = pd.to_datetime(training_end_date)
    if train_on == "ARR":
        df = (df_orig.filter(df_orig.DEST == airport_code)
              .filter(col("SCHEDULED_ARR_TIME")
                      .between(window_start, window_end)))
    else:
        cols_to_drop_if_departure = [
            "TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN", "ARR_DELAY",
            "DIVERTED", "CRS_ELAPSED_TIME", "ACTUAL_ELAPSED_TIME", "AIR_TIME",
        ]
        df = (df_orig.filter(df_orig.ORIGIN == airport_code)
              .filter(col("SCHEDULED_DEP_TIME")
                      .between(window_start, window_end))
              .drop(*cols_to_drop_if_departure))

    (trainDF, testDF) = df.randomSplit([0.8, 0.2], seed=42)

    # NOTE(review): the run name ends in "_arr_del" for departure models as
    # well; confirm whether that is intended before changing it.
    with mlflow.start_run(
            run_name="flights-randomforest-with-regressors-{0}_arr_del"
            .format(airport_code)) as run:
        rf = RandomForestRegressor(featuresCol="features",
                                   labelCol=label_col,
                                   maxDepth=maxDepth,
                                   numTrees=numTrees)
        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
        mlflow.log_param("num_trees", rf.getNumTrees())
        mlflow.log_param("max_depth", rf.getMaxDepth())

        # Fitting is where the indexer/assembler stages are applied to the
        # training data before the forest is trained.
        pipelineModel = pipeline.fit(trainDF)
        artifact_path = "{0}_rfr_{1}".format(airport_code, train_on)
        mlflow.spark.log_model(pipelineModel, artifact_path)

        tags = {"training_start_date": training_start_date,
                "training_end_date": training_end_date}
        mlflow.set_tags(tags)

        # Metrics on the held-out split; the evaluator is reused and only
        # its metric name is switched between the two evaluations.
        predDF = pipelineModel.transform(testDF)
        regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                                  labelCol=label_col)
        rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
        r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # NOTE(review): client is no longer needed by the code reviewed here;
        # kept in case later code uses it - remove once confirmed unused.
        client = MlflowClient()

        # Use the id of the run opened above.  Searching the experiment for
        # the most recently started run can pick up a different run when
        # several trainings log to the same experiment at once.
        runID = run.info.run_id

        model_name = "rfr_{0}_{1}_{2}_{3}".format(
            airport_code, training_start_date, training_end_date, train_on)
        model_uri = "runs:/{run_id}/{path}".format(run_id=runID,
                                                   path=artifact_path)
        print("model_name = ", model_name, "model_uri = ", model_uri)
        model_details = mlflow.register_model(model_uri=model_uri,
                                              name=model_name)
        print("REGISTERED")