예제 #1
0
# Fit the estimator `lr` on the training split.
model = lr.fit(training)

# R2 is a fractional value, so it is printed with "%f"; "%d" would truncate
# it to an integer (e.g. 0.87 would be shown as 0).
print("결정계수(R2):%f" % model.summary.r2)

# Score the test split; the result is cached because it is evaluated
# several times below.
d13 = model.transform(test)
d13.cache()

d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error (no metric name set, so the evaluator's default is used)
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

# All four metrics are floats; "%f" keeps their fractional part.
print("rmse:%f, mse:%f, r2:%f, mae:%f" % (rmse, mse, r2, mae))

# Pipeline: index the categorical columns, assemble the features, then fit `lr`.
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])
# Random 70/30 split of df9 into training and test sets.
samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]
예제 #2
0
                                          labelCol="price",
                                          metricName="rmse")

# Evaluate with the evaluator's current metric, then round to two decimals for display.
rmse = regressionEvaluator.evaluate(predDF)
rmse = round(rmse, 2)
print(f"RMSE is {rmse}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## R2
# MAGIC
# MAGIC ![](https://files.training.databricks.com/images/r2d2.jpg) How is our R2 doing?

# COMMAND ----------

# Reuse the same evaluator; it stays switched to the "r2" metric afterwards.
regressionEvaluator.setMetricName("r2")
r2 = round(regressionEvaluator.evaluate(predDF), 2)
print(f"R2 is {r2}")

# COMMAND ----------

# Persist the fitted pipeline model, replacing anything already stored at the path.
pipelinePath = "/tmp/sf-airbnb/lr-pipeline-model"
(pipelineModel
 .write()
 .overwrite()
 .save(pipelinePath))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Loading models
# MAGIC
# MAGIC When you load in models, you need to know the type of model you are loading back in (was it a linear regression or logistic regression model?).
# MAGIC
# MAGIC For this reason, we recommend you always put your transformers/estimators into a Pipeline, so you can always load the generic PipelineModel back in.
# Fit the estimator on the full ad_df DataFrame
lr_model = lr.fit(ad_df)


# In[6]:
# Predict on the same DataFrame the model was fitted on
pred = lr_model.transform(ad_df)
pred.show(5)


# In[7]:
# Model evaluation

from pyspark.ml.evaluation import RegressionEvaluator
# Configure the evaluator for R2 directly; the predictions come from the
# fitting data, so this is an in-sample score.
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='label',
    predictionCol='prediction',
)
evaluator.evaluate(pred)
# In[8]:
def modelsummary(model, param_names):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients), model.intercept)
    Summary=model.summary
    param_names.append('intercept')

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:14.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:12.3f}'.format(Summary.tValues[i]),\
                                    "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","SEX_Vec","MARRIAGE_Vec", \
                                    "AGE_Vec","EDUCATION_Vec","PAY_0_Vec","PAY_2_Vec","PAY_3_Vec","PAY_4_Vec","PAY_5_Vec","PAY_6_Vec"],outputCol="features")

# Apply the assembler to the transformed DataFrame.
output = assembler.transform(trans_df3)


# Split the data 70/30 into training and test sets (no seed, so the split
# differs between runs).
trainingData, testData = output.randomSplit([0.7, 0.3])

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve, computed from the raw prediction column.
binaryEvaluator = BinaryClassificationEvaluator(
    labelCol="Y",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the label "Y" and the predicted value.
evaluatorRegression = RegressionEvaluator(
    labelCol="Y",
    predictionCol="prediction",
    metricName="rmse",
)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    labelCol='Y',
    maxIter=10,
    regParam=0.03,
    elasticNetParam=0.8,
)
model = lr.fit(trainingData)

# Area under ROC as reported by the training summary.
print(model.summary.areaUnderROC)

# Score the training split and report its area under ROC.
prediction = model.transform(trainingData)
areaTraining = binaryEvaluator.evaluate(prediction)
print("Area Under ROC using Logistics Regression on training data =" + str(areaTraining))

# Score the held-out split and report the same metric, then its RMSE.
predictionTest = model.transform(testData)
areaTest = binaryEvaluator.evaluate(predictionTest)
print("Area Under ROC using Logistics Regression on test data =" + str(areaTest))
rmseLR = evaluatorRegression.evaluate(predictionTest)
def train_model(df_orig, maxDepth, numTrees, stringIndexer, vecAssembler, train_on):
  """Train, log and register a random-forest delay regressor for one airport.

  The flights in ``df_orig`` are restricted to one airport and one date range.
  Those are taken from the names ``airport_code``, ``training_start_date`` and
  ``training_end_date``, which are NOT parameters: they must already be
  defined in the enclosing (module / notebook) scope.

  The filtered data is split 80/20 with a fixed seed. Inside an MLflow run a
  ``Pipeline`` of ``stringIndexer``, ``vecAssembler`` and a
  ``RandomForestRegressor`` is fitted on the 80% split, the fitted pipeline is
  logged, RMSE and R2 on the 20% split are logged as metrics, and finally the
  logged model is registered in the MLflow model registry.

  Args:
    df_orig: Spark DataFrame of flights. It must contain ``DEST`` and
      ``SCHEDULED_ARR_TIME`` when ``train_on == "ARR"``, or ``ORIGIN`` and
      ``SCHEDULED_DEP_TIME`` when ``train_on == "DEP"``, plus the label column.
    maxDepth: ``maxDepth`` passed to ``RandomForestRegressor``.
    numTrees: ``numTrees`` passed to ``RandomForestRegressor``.
    stringIndexer: First pipeline stage.
    vecAssembler: Second pipeline stage. The regressor reads its features from
      a column named ``"features"``, so this stage is expected to produce it.
    train_on: ``"ARR"`` to predict ``ARR_DELAY`` for flights arriving at the
      airport, or ``"DEP"`` to predict ``DEP_DELAY`` for flights departing
      from it.

  Raises:
    ValueError: If ``train_on`` is neither ``"ARR"`` nor ``"DEP"``.
  """
  # Resolve everything that depends on the mode in one place. Validating here,
  # before any work is done, turns an unsupported mode into a clear error
  # instead of an AttributeError on None further down.
  if train_on == "ARR":
    airport_col, time_col, label_col = "DEST", "SCHEDULED_ARR_TIME", "ARR_DELAY"
    cols_to_drop = []
  elif train_on == "DEP":
    airport_col, time_col, label_col = "ORIGIN", "SCHEDULED_DEP_TIME", "DEP_DELAY"
    # NOTE(review): presumably these are dropped because they are only known
    # after the aircraft has departed - confirm with the data owner.
    cols_to_drop = ["TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN", "ARR_DELAY",
                    "DIVERTED", "CRS_ELAPSED_TIME", "ACTUAL_ELAPSED_TIME", "AIR_TIME"]
  else:
    raise ValueError(
        "train_on must be 'ARR' or 'DEP', got {0!r}".format(train_on))

  import mlflow
  import mlflow.spark
  import pandas as pd
  from pyspark.ml import Pipeline
  from pyspark.ml.evaluation import RegressionEvaluator
  from pyspark.ml.regression import RandomForestRegressor
  from pyspark.sql.functions import col

  # Keep only the flights of the selected airport whose scheduled time falls
  # inside the training date range. Performance is measured on this subset.
  df = (df_orig.filter(df_orig[airport_col] == airport_code)
        .filter(col(time_col).between(pd.to_datetime(training_start_date),
                                      pd.to_datetime(training_end_date))))
  if cols_to_drop:
    df = df.drop(*cols_to_drop)

  (trainDF, testDF) = df.randomSplit([0.8, 0.2], seed=42)

  # The same artifact path is used when logging the model and when building
  # the URI for registration, so it is computed once.
  artifact_path = "{0}_rfr_{1}".format(airport_code, train_on)

  with mlflow.start_run(run_name="flights-randomforest-with-regressors-{0}_arr_del".format(airport_code)) as run:
    rf = RandomForestRegressor(featuresCol="features", labelCol=label_col,
                               maxDepth=maxDepth, numTrees=numTrees)
    pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Fitting the pipeline is where the training data gets indexed and
    # vectorized by the first two stages.
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, artifact_path)

    tags = {"training_start_date": training_start_date, "training_end_date": training_end_date}
    mlflow.set_tags(tags)

    # Log metrics: RMSE and R2 on the held-out split.
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol=label_col)
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

  # Use the id of the run created above. Looking up "the most recent run of
  # the experiment" instead could pick a different run if another one was
  # started in the meantime.
  runID = run.info.run_id
  model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date, training_end_date, train_on)
  model_uri = "runs:/{run_id}/{artifact}".format(run_id=runID, artifact=artifact_path)
  print("model_name = ", model_name, "model_uri = ", model_uri)
  mlflow.register_model(model_uri=model_uri, name=model_name)
  print("REGISTERED")
예제 #6
0
# R2 is a fractional value, so it is printed with "%f"; "%d" would truncate
# it to an integer (e.g. 0.87 would be shown as 0).
print("결정계수(R2):%f" % model.summary.r2)

# Score the test split; the result is cached because it is evaluated
# several times below.
d13 = model.transform(test)
d13.cache()

d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight",
                                predictionCol="predic_weight")

# root mean squared error (no metric name set, so the evaluator's default is used)
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

# All four metrics are floats; "%f" keeps their fractional part.
print("rmse:%f, mse:%f, r2:%f, mae:%f" % (rmse, mse, r2, mae))

# Pipeline: index the categorical columns, assemble the features, then fit `lr`.
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])
# Random 70/30 split of df9 into training and test sets.
samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]
# COMMAND ----------

# MAGIC %md
# MAGIC ## Evaluate
# MAGIC 
# MAGIC Next, we'll use `RegressionEvaluator` to assess the results. The default regression metric is RMSE.
# MAGIC 
# MAGIC For more information see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator

# Compare the predictions against the "cnt" label column.
evaluator = RegressionEvaluator(labelCol="cnt")

# No metric name has been set yet, so this uses the evaluator's default (RMSE).
rmse = evaluator.evaluate(predictionsDF)

# Switch the metric on the same evaluator before each further evaluation.
evaluator.setMetricName("r2")
r2 = evaluator.evaluate(predictionsDF)

evaluator.setMetricName("mse")
mse = evaluator.evaluate(predictionsDF)

evaluator.setMetricName("mae")
mae = evaluator.evaluate(predictionsDF)

print("Test RMSE = %f" % rmse)
print("R^2 = %f" % r2)
print("MSE = %f" % mse)
print("MAE = %f" % mae)

# COMMAND ----------


예제 #8
0
# `RegressionEvaluator` lives in the `pyspark.ml.evaluation` module:
from pyspark.ml.evaluation import RegressionEvaluator

# Build an evaluator that scores the `prediction` column against the
# `duration` label using R-squared:
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="duration",
    predictionCol="prediction",
)

# `explainParams` describes every parameter, including the other metrics:
print(evaluator.explainParams())

# `evaluate` computes the current metric on the `predictions` DataFrame:
evaluator.evaluate(predictions)

# `setMetricName` switches the metric on the same evaluator; evaluate again
# to get the RMSE:
evaluator.setMetricName("rmse")
evaluator.evaluate(predictions)

# **Note:** You can also use the `evaluate` method of the `LinearRegressionModel` class.

# ## Plot the linear regression model


def plot_lr_model():
    """Plot observed and predicted duration against distance.

    Reads the ``predictions`` DataFrame and ``plt`` from the enclosing scope
    (``plt`` is presumably ``matplotlib.pyplot`` - confirm against the file's
    imports). Takes no arguments.
    """
    # Sample a fraction of 0.1 of the rows without replacement (fixed seed)
    # and convert the sample to a pandas DataFrame for plotting.
    pdf = predictions.sample(withReplacement=False, fraction=0.1,
                             seed=34512).toPandas()
    # Observed values as a scatter plot, predictions as a black line.
    plt.scatter("distance", "duration", data=pdf)
    plt.plot("distance", "prediction", color="black", data=pdf)
    plt.xlabel("Distance (m)")
    plt.ylabel("Duration (s)")
    plt.title("Linear Regression Model")