model = lr.fit(training)
print("Coefficient of determination (R2): %f" % model.summary.r2)

d13 = model.transform(test)
d13.cache()
d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error (the evaluator's default metric)
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

# %f, not %d: these metrics are floats and %d would truncate them to integers
print("rmse:%f, mse:%f, r2:%f, mae:%f" % (rmse, mse, r2, mae))

# Pipeline
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])
samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]
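# The pipeline above is assembled but never fit in this snippet. A minimal
# sketch of the remaining steps, reusing the training2/test2 split and the
# evaluator defined above (d14 is a hypothetical name for the transformed
# test set):
pipelineModel = pipeline.fit(training2)
d14 = pipelineModel.transform(test2)
print("pipeline rmse: %f" % evaluator.setMetricName("rmse").evaluate(d14))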
labelCol="price", metricName="rmse") rmse = round(regressionEvaluator.evaluate(predDF), 2) print(f"RMSE is {rmse}") # COMMAND ---------- # MAGIC %md # MAGIC ## R2 # MAGIC # MAGIC ![](https://files.training.databricks.com/images/r2d2.jpg) How is our R2 doing? # COMMAND ---------- r2 = round(regressionEvaluator.setMetricName("r2").evaluate(predDF), 2) print(f"R2 is {r2}") # COMMAND ---------- pipelinePath = "/tmp/sf-airbnb/lr-pipeline-model" pipelineModel.write().overwrite().save(pipelinePath) # COMMAND ---------- # MAGIC %md # MAGIC ## Loading models # MAGIC # MAGIC When you load in models, you need to know the type of model you are loading back in (was it a linear regression or logistic regression model?). # MAGIC # MAGIC For this reason, we recommend you always put your transformers/estimators into a Pipeline, so you can always load the generic PipelineModel back in.
# Fit the model
lr_model = lr.fit(ad_df)

# In[6]:

# Prediction
pred = lr_model.transform(ad_df)
pred.show(5)

# In[7]:

# Model evaluation
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
evaluator.setMetricName('r2').evaluate(pred)

# In[8]:

def modelsummary(model, param_names):
    import numpy as np
    print("Note: the last row is the information for the intercept")
    print("##", "-------------------------------------------------")
    print("##", " Parameter |  Estimate | Std.Error | t Values | P-value")
    coef = np.append(list(model.coefficients), model.intercept)
    Summary = model.summary
    param_names.append('intercept')
    for i in range(len(Summary.pValues)):
        print("##", '{:>10s}'.format(param_names[i]),
              '{:10.6f}'.format(coef[i]),
              '{:14.6f}'.format(Summary.coefficientStandardErrors[i]),
              '{:12.3f}'.format(Summary.tValues[i]),
              '{:12.6f}'.format(Summary.pValues[i]))
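# In[9]:

# A minimal sketch of calling the summary helper above. The feature names are
# placeholders (assuming the classic advertising dataset behind ad_df); swap in
# whatever columns actually feed the model:
modelsummary(lr_model, ['TV', 'Radio', 'Newspaper'])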
"PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","SEX_Vec","MARRIAGE_Vec", \ "AGE_Vec","EDUCATION_Vec","PAY_0_Vec","PAY_2_Vec","PAY_3_Vec","PAY_4_Vec","PAY_5_Vec","PAY_6_Vec"],outputCol="features") output = assembler.transform(trans_df3) #Split the training & test data (trainingData, testData) = output.randomSplit([0.7, 0.3]) from pyspark.ml.evaluation import BinaryClassificationEvaluator binaryEvaluator=BinaryClassificationEvaluator(labelCol="Y",rawPredictionCol="rawPrediction") binaryEvaluator.setMetricName("areaUnderROC") from pyspark.ml.evaluation import RegressionEvaluator evaluatorRegression=RegressionEvaluator(labelCol="Y",predictionCol="prediction") evaluatorRegression.setMetricName("rmse") from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol='Y',maxIter=10, regParam=0.03, elasticNetParam=0.8) model = lr.fit(trainingData) print(model.summary.areaUnderROC) prediction=model.transform(trainingData) areaTraining=binaryEvaluator.evaluate(prediction) print("Area Under ROC using Logistics Regression on training data =" + str(areaTraining)) predictionTest=model.transform(testData) areaTest=binaryEvaluator.evaluate(predictionTest) print("Area Under ROC using Logistics Regression on test data =" + str(areaTest)) rmseLR = evaluatorRegression.evaluate(predictionTest)
def train_model(df_orig, maxDepth, numTrees, stringIndexer, vecAssembler, train_on):
    from pyspark.sql.functions import col
    import mlflow
    import mlflow.spark
    import pandas as pd
    from pyspark.ml import Pipeline
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from mlflow.tracking import MlflowClient

    # Note: airport_code, training_start_date, and training_end_date are
    # expected to be defined in the surrounding notebook scope.

    # Filter the source dataframe to the given airport and the training date
    # range; this filtered dataset is what we train, test, and measure on.
    df = None
    if train_on == "ARR":
        df = (df_orig.filter(df_orig.DEST == airport_code)
                     .filter(col("SCHEDULED_ARR_TIME")
                             .between(pd.to_datetime(training_start_date),
                                      pd.to_datetime(training_end_date))))
    elif train_on == "DEP":
        cols_to_drop_if_departure = ["TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN",
                                     "ARR_DELAY", "DIVERTED", "CRS_ELAPSED_TIME",
                                     "ACTUAL_ELAPSED_TIME", "AIR_TIME"]
        df = (df_orig.filter(df_orig.ORIGIN == airport_code)
                     .filter(col("SCHEDULED_DEP_TIME")
                             .between(pd.to_datetime(training_start_date),
                                      pd.to_datetime(training_end_date)))
                     .drop(*cols_to_drop_if_departure))

    (trainDF, testDF) = df.randomSplit([0.8, 0.2], seed=42)

    with mlflow.start_run(run_name="flights-randomforest-with-regressors-{0}_arr_del".format(airport_code)) as run:
        rf = None
        if train_on == "ARR":
            rf = RandomForestRegressor(featuresCol="features", labelCol="ARR_DELAY",
                                       maxDepth=maxDepth, numTrees=numTrees)
        elif train_on == "DEP":
            rf = RandomForestRegressor(featuresCol="features", labelCol="DEP_DELAY",
                                       maxDepth=maxDepth, numTrees=numTrees)

        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
        mlflow.log_param("num_trees", rf.getNumTrees())
        mlflow.log_param("max_depth", rf.getMaxDepth())

        # Log model. Fitting the pipeline is where the training dataset is
        # string-indexed and vectorized before the forest is trained.
        pipelineModel = pipeline.fit(trainDF)
        mlflow.spark.log_model(pipelineModel, "{0}_rfr_{1}".format(airport_code, train_on))

        tags = {"training_start_date": training_start_date,
                "training_end_date": training_end_date}
        mlflow.set_tags(tags)

        # Log metrics: RMSE and R2
        predDF = pipelineModel.transform(testDF)
        regressionEvaluator = None
        if train_on == "ARR":
            regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY")
        elif train_on == "DEP":
            regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="DEP_DELAY")
        rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
        r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Look up the most recent run and register its model.
        client = MlflowClient()
        runs = client.search_runs([run.info.experiment_id],
                                  order_by=["attributes.start_time desc"],
                                  max_results=1)
        runID = runs[0].info.run_id
        model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date,
                                                  training_end_date, train_on)
        model_uri = "runs:/{run_id}/{code}_rfr_{arr_del}".format(run_id=runID,
                                                                 code=airport_code,
                                                                 arr_del=train_on)
        print("model_name =", model_name, "model_uri =", model_uri)
        model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
        print("REGISTERED")
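# A minimal sketch of calling the helper above. The indexer and assembler
# stages, plus the airport_code / training date globals it references, are
# assumed to be defined earlier in the notebook; the hyperparameter values
# here are illustrative only:
train_model(df_orig, maxDepth=5, numTrees=100,
            stringIndexer=stringIndexer, vecAssembler=vecAssembler,
            train_on="ARR")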
print("결정계수(R2):%d" % model.summary.r2) d13 = model.transform(test) d13.cache() d13.select("weight", "predic_weight").show(5, False) evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight") # root mean squared error rmse = evaluator.evaluate(d13) # mean squared error mse = evaluator.setMetricName("mse").evaluate(d13) # R2 metric r2 = evaluator.setMetricName("r2").evaluate(d13) # mean absolute error mae = evaluator.setMetricName("mae").evaluate(d13) print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae)) # 파이프라인 pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr]) samples2 = df9.randomSplit([0.7, 0.3]) training2 = samples2[0] test2 = samples2[1]
# COMMAND ----------

# MAGIC %md
# MAGIC ## Evaluate
# MAGIC
# MAGIC Next, we'll use `RegressionEvaluator` to assess the results. The default regression metric is RMSE.
# MAGIC
# MAGIC For more information, see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator().setLabelCol("cnt")

rmse = evaluator.evaluate(predictionsDF)
r2 = evaluator.setMetricName("r2").evaluate(predictionsDF)
mse = evaluator.setMetricName("mse").evaluate(predictionsDF)
mae = evaluator.setMetricName("mae").evaluate(predictionsDF)

print("Test RMSE = %f" % rmse)
print("R^2 = %f" % r2)
print("MSE = %f" % mse)
print("MAE = %f" % mae)

# COMMAND ----------
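# MAGIC %md
# MAGIC For reference (definitions, not notebook output), the four metrics printed above are
# MAGIC
# MAGIC \\( \mathrm{RMSE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2} \\),
# MAGIC \\( \mathrm{MSE} = \mathrm{RMSE}^2 \\),
# MAGIC \\( \mathrm{MAE} = \tfrac{1}{n}\sum_{i=1}^{n}\lvert y_i-\hat{y}_i \rvert \\), and
# MAGIC \\( R^2 = 1 - \frac{\sum_i (y_i-\hat{y}_i)^2}{\sum_i (y_i-\bar{y})^2} \\).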
# Import the `RegressionEvaluator` class from the `pyspark.ml.evaluation` module:
from pyspark.ml.evaluation import RegressionEvaluator

# Create an instance of the `RegressionEvaluator` class:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="duration", metricName="r2")

# Call the `explainParams` method to see other metrics:
print(evaluator.explainParams())

# Use the `evaluate` method to compute the metric on the `predictions` DataFrame:
evaluator.evaluate(predictions)

# Use the `setMetricName` method to change the metric:
evaluator.setMetricName("rmse").evaluate(predictions)

# **Note:** You can also use the `evaluate` method of the `LinearRegressionModel` class.


# ## Plot the linear regression model

import matplotlib.pyplot as plt

def plot_lr_model():
    pdf = predictions.sample(withReplacement=False, fraction=0.1, seed=34512).toPandas()
    plt.scatter("distance", "duration", data=pdf)
    plt.plot("distance", "prediction", color="black", data=pdf)
    plt.xlabel("Distance (m)")
    plt.ylabel("Duration (s)")
    plt.title("Linear Regression Model")
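# A minimal sketch of rendering the plot defined above; plt.show() is needed
# when running outside a notebook environment:
plot_lr_model()
plt.show()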