Пример #1
0
    field for (field, dataType) in trainDF.dtypes
    if ((dataType == "double") & (field != "price"))
]
# Feature columns fed to the assembler: the indexed categorical outputs
# followed by the numeric columns (both lists are built above this fragment).
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
# Random forest predicting the "price" column; the seed is fixed at 42.
rf = RandomForestRegressor(labelCol="price",
                           maxBins=40,
                           maxDepth=5,
                           numTrees=100,
                           seed=42)
# Stage order: stringIndexer -> vecAssembler -> rf.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

# Everything logged inside this block is attached to the "random-forest" run.
with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth (read back from the estimator)
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    # Fit the pipeline on trainDF and log the fitted model under the
    # artifact path "model"
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")
    # Log metrics: RMSE and R2, both computed on testDF predictions
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="price")
    # The same evaluator object is reused; only its metric name is switched.
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    # Log artifact: feature importance scores
    # The last pipeline stage is the fitted model produced by rf.
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(
        list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)),
        columns=["feature", "importance"]).sort_values(by="importance",
Пример #2
0
# Alternative way of selecting the tracking server (left disabled):
#os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000/'
# Send all tracking calls below to the server at http://localhost:5000/.
mlflow.set_tracking_uri('http://localhost:5000/')
print("mlflow tracking_uri: " + mlflow.tracking.get_tracking_uri())

# Disabled: explicit experiment creation with an HDFS artifact location.
# try:
#     mlflow.create_experiment(name="Advertising with Spark3",
#                              artifact_location="hdfs://localhost:9000/user/train/mlflow")
# except Exception as e:
#     print(e)

# Make "Advertising with Spark3" the active experiment for the run below.
mlflow.set_experiment("Advertising with Spark3")

# NOTE: estimator, pipeline_obj, train_df and test_df are defined outside
# this fragment.
with mlflow.start_run(run_name="spark-advertising-random-forest") as run:
    # Log params: num_trees, max_depth and max_bins (read from the estimator)
    mlflow.log_param("num_trees", estimator.getNumTrees())
    mlflow.log_param("max_depth", estimator.getMaxDepth())
    mlflow.log_param("max_bins", estimator.getMaxBins())

    # Fit the pipeline on train_df and log the fitted model under the
    # artifact path "model"
    pipelineModel = pipeline_obj.fit(train_df)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2 of the "prediction" column against "Sales",
    # computed on test_df
    predDF = pipelineModel.transform(test_df)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="Sales")
    # The same evaluator object is reused; only its metric name is switched.
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log the sklearn model and register as version 1
Пример #3
0
def train_model(df_orig, maxDepth, numTrees):
    """Train, log, register and stage a random-forest ARR_DELAY model.

    The function reads several names that are NOT parameters and must be
    defined in the enclosing module/notebook scope: ``airport_code``,
    ``training_start_date``, ``training_end_date``, ``inference_date`` and
    the helper ``load_assemblers``.

    Steps:
      1. Keep rows whose ``DEST`` equals ``airport_code``; rows with
         ``SCHEDULED_DEP_TIME`` inside the training date range are split
         80/20 (seed 42) into train and test sets.
      2. Fit ``stringIndexer -> vecAssembler -> RandomForestRegressor`` on
         the train split inside an MLflow run, logging params, tags, the
         model and the test-set RMSE / R2.
      3. Register the logged model and move the new version to
         "Production", or to "Staging" when a version of that model is
         already in "Production" or "Staging" (a previously staged version
         is archived first).
      4. Score the rows whose scheduled departure date equals
         ``inference_date``.

    Args:
        df_orig: Spark DataFrame of flights; this function reads its
            ``DEST`` and ``SCHEDULED_DEP_TIME`` columns and trains on the
            ``ARR_DELAY`` label.
        maxDepth: ``maxDepth`` passed to ``RandomForestRegressor``.
        numTrees: ``numTrees`` passed to ``RandomForestRegressor``.

    Returns:
        The DataFrame returned by the fitted pipeline's ``transform`` on
        the inference-date rows.
    """
    import mlflow
    import mlflow.spark
    import pandas as pd
    from mlflow.tracking import MlflowClient
    from pyspark.ml import Pipeline
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.sql.functions import col, to_date

    dest = airport_code
    flights_to_dest = df_orig.filter(df_orig.DEST == dest)

    # Rows in the training date range: used for fitting and for measuring
    # performance on the held-out split.
    df = flights_to_dest.filter(
        col("SCHEDULED_DEP_TIME").between(pd.to_datetime(training_start_date),
                                          pd.to_datetime(training_end_date)))
    # Rows for the inference date only: these are the ones actually scored
    # and returned.
    df_inference = flights_to_dest.filter(
        to_date(col("SCHEDULED_DEP_TIME")) == inference_date)

    trainDF, testDF = df.randomSplit([0.8, 0.2], seed=42)

    stringIndexer, vecAssembler = load_assemblers(df_orig)
    artifact_path = "{0}_rfr".format(dest)

    with mlflow.start_run(
            run_name="flights-randomforest-with-regressors-{0}".format(
                dest)) as run:
        rf = RandomForestRegressor(featuresCol="features",
                                   labelCol="ARR_DELAY",
                                   maxDepth=maxDepth,
                                   numTrees=numTrees)
        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
        mlflow.log_param("num_trees", rf.getNumTrees())
        mlflow.log_param("max_depth", rf.getMaxDepth())

        # Fitting is the point where the pipeline indexes and vectorizes
        # the training data.
        pipelineModel = pipeline.fit(trainDF)
        mlflow.spark.log_model(pipelineModel, artifact_path)

        mlflow.set_tags({
            "training_start_date": training_start_date,
            "training_end_date": training_end_date,
        })

        # Log metrics: RMSE and R2 on the held-out split.
        predDF = pipelineModel.transform(testDF)
        regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                                  labelCol="ARR_DELAY")
        rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
        r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Use the id of the run created above directly; searching the
    # experiment for "the most recent run" could pick up a different run
    # started concurrently.
    model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code,
                                              training_start_date,
                                              training_end_date,
                                              inference_date)
    model_uri = "runs:/{run_id}/{path}".format(run_id=run.info.run_id,
                                               path=artifact_path)
    model_details = mlflow.register_model(model_uri=model_uri, name=model_name)

    # Decide the stage for the new version: "Production" when nothing is
    # deployed yet, otherwise "Staging" (archiving any version staged
    # before it).
    client = MlflowClient()
    model_stage = "Production"
    for mv in client.search_model_versions(f"name='{model_name}'"):
        if mv.current_stage == "Staging":
            client.transition_model_version_stage(name=mv.name,
                                                  version=mv.version,
                                                  stage="Archived")
            model_stage = "Staging"
        elif mv.current_stage == "Production":
            model_stage = "Staging"

    # Promote exactly the version registered above; the order of
    # search_model_versions results is not relied upon.
    client.transition_model_version_stage(name=model_name,
                                          version=model_details.version,
                                          stage=model_stage)

    # Predicted delay for the inference-date rows.
    return pipelineModel.transform(df_inference)
def train_model(df_orig, maxDepth, numTrees, stringIndexer, vecAssembler, train_on):
  from pyspark.sql.functions import col, to_date
  import mlflow
  import mlflow.spark
  import pandas as pd
  import uuid
  from pyspark.ml import Pipeline
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  from pyspark.ml.regression import RandomForestRegressor
  from pyspark.ml.evaluation import RegressionEvaluator
  from pyspark.sql.functions import lit
  from mlflow.tracking import MlflowClient
  
  
# The following dataframe contains the airport and the training dates range. They are used for training and testing a dataset in the training dates range.
# This is where we measure the performance from.

  df = None
  if train_on == "ARR":
    df = (df_orig.filter(df_orig.DEST == airport_code)
          .filter(col("SCHEDULED_ARR_TIME").
                  between(pd.to_datetime(training_start_date), 
                          pd.to_datetime(training_end_date))))
  
  
#     display(df_orig)
  elif train_on == "DEP":
  
    cols_to_drop_if_departure = ["TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN", "ARR_DELAY", "DIVERTED", "CRS_ELAPSED_TIME", "ACTUAL_ELAPSED_TIME", "AIR_TIME"]
    df = (df_orig.filter(df_orig.ORIGIN == airport_code)
          .filter(col("SCHEDULED_DEP_TIME").
                  between(pd.to_datetime(training_start_date), 
                          pd.to_datetime(training_end_date)))
         .drop(*cols_to_drop_if_departure))
#     display(df.take(2))

#   print("DF = ")
#   display(df)
  (trainDF, testDF) = df.randomSplit([0.8,0.2], seed=42)
  
  with mlflow.start_run(run_name="flights-randomforest-with-regressors-{0}_arr_del".format(airport_code)) as run:
    rf = None
    if train_on == "ARR":
        rf = RandomForestRegressor(featuresCol = "features", labelCol="ARR_DELAY", maxDepth=maxDepth, numTrees=numTrees)
    elif train_on == "DEP":
      rf = RandomForestRegressor(featuresCol = "features", labelCol="DEP_DELAY", maxDepth=maxDepth, numTrees=numTrees)
    pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    
#     print(train_on)
#     display(trainDF)
    # Log model
    pipelineModel = pipeline.fit(trainDF)
    # it is at this point where the pipeline "modifies" the training dataset and vectorizes it
    mlflow.spark.log_model(pipelineModel,
                           "{0}_rfr_{1}".format(airport_code, train_on))
    
    tags = {"training_start_date": training_start_date, "training_end_date": training_end_date}
    mlflow.set_tags(tags)

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = None
    if train_on == "ARR":
      regressionEvaluator = RegressionEvaluator(predictionCol="prediction", 
                                              labelCol="ARR_DELAY")
    elif train_on == "DEP":
      regressionEvaluator = RegressionEvaluator(predictionCol="prediction", 
                                              labelCol="DEP_DELAY")
      
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    
    
  client = MlflowClient()
  runs = client.search_runs(run.info.experiment_id,
                          order_by=["attributes.start_time desc"], 
                          max_results=1)
  runID = runs[0].info.run_uuid
  model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date, training_end_date, train_on)
  model_uri = "runs:/{run_id}/{code}_rfr_{arr_del}".format(run_id=runID, code = airport_code, arr_del = train_on)
  print("model_name = ", model_name, "model_uri = ", model_uri)
  model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
  print("REGISTERED")