model = lr.fit(training)
print("Coefficient of determination (R2): %f" % model.summary.r2)

d13 = model.transform(test)
d13.cache()
d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error (the evaluator's default metric)
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

# %f, not %d: these metrics are floats and %d would truncate them to integers
print("rmse:%f, mse:%f, r2:%f, mae:%f" % (rmse, mse, r2, mae))

# Pipeline
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])
samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]
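# The pipeline above is assembled but never fit in this snippet. A minimal
# sketch of the remaining steps, reusing the training2/test2 split and the
# evaluator defined above (d14 is a hypothetical name for the transformed
# test set):
pipelineModel = pipeline.fit(training2)
d14 = pipelineModel.transform(test2)
print("pipeline rmse: %f" % evaluator.setMetricName("rmse").evaluate(d14))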
labelCol="price", metricName="rmse") rmse = round(regressionEvaluator.evaluate(predDF), 2) print(f"RMSE is {rmse}") # COMMAND ---------- # MAGIC %md # MAGIC ## R2 # MAGIC # MAGIC ![](https://files.training.databricks.com/images/r2d2.jpg) How is our R2 doing? # COMMAND ---------- r2 = round(regressionEvaluator.setMetricName("r2").evaluate(predDF), 2) print(f"R2 is {r2}") # COMMAND ---------- pipelinePath = "/tmp/sf-airbnb/lr-pipeline-model" pipelineModel.write().overwrite().save(pipelinePath) # COMMAND ---------- # MAGIC %md # MAGIC ## Loading models # MAGIC # MAGIC When you load in models, you need to know the type of model you are loading back in (was it a linear regression or logistic regression model?). # MAGIC # MAGIC For this reason, we recommend you always put your transformers/estimators into a Pipeline, so you can always load the generic PipelineModel back in.
# Fit the model
lr_model = lr.fit(ad_df)

# In[6]:

# Prediction
pred = lr_model.transform(ad_df)
pred.show(5)

# In[7]:

# Model evaluation
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
evaluator.setMetricName('r2').evaluate(pred)

# In[8]:

def modelsummary(model, param_names):
    import numpy as np
    print("Note: the last row is the information for the intercept")
    print("##", "-------------------------------------------------")
    print("##", " Parameter |  Estimate | Std.Error | t Values | P-value")
    coef = np.append(list(model.coefficients), model.intercept)
    Summary = model.summary
    param_names.append('intercept')
    for i in range(len(Summary.pValues)):
        print("##", '{:>10s}'.format(param_names[i]),
              '{:10.6f}'.format(coef[i]),
              '{:14.6f}'.format(Summary.coefficientStandardErrors[i]),
              '{:12.3f}'.format(Summary.tValues[i]),
              '{:12.6f}'.format(Summary.pValues[i]))
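# In[9]:

# A minimal sketch of calling the summary helper above. The feature names are
# placeholders (assuming the classic advertising dataset behind ad_df); swap in
# whatever columns actually feed the model:
modelsummary(lr_model, ['TV', 'Radio', 'Newspaper'])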
"PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","SEX_Vec","MARRIAGE_Vec", \ "AGE_Vec","EDUCATION_Vec","PAY_0_Vec","PAY_2_Vec","PAY_3_Vec","PAY_4_Vec","PAY_5_Vec","PAY_6_Vec"],outputCol="features") output = assembler.transform(trans_df3) #Split the training & test data (trainingData, testData) = output.randomSplit([0.7, 0.3]) from pyspark.ml.evaluation import BinaryClassificationEvaluator binaryEvaluator=BinaryClassificationEvaluator(labelCol="Y",rawPredictionCol="rawPrediction") binaryEvaluator.setMetricName("areaUnderROC") from pyspark.ml.evaluation import RegressionEvaluator evaluatorRegression=RegressionEvaluator(labelCol="Y",predictionCol="prediction") evaluatorRegression.setMetricName("rmse") from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(labelCol='Y',maxIter=10, regParam=0.03, elasticNetParam=0.8) model = lr.fit(trainingData) print(model.summary.areaUnderROC) prediction=model.transform(trainingData) areaTraining=binaryEvaluator.evaluate(prediction) print("Area Under ROC using Logistics Regression on training data =" + str(areaTraining)) predictionTest=model.transform(testData) areaTest=binaryEvaluator.evaluate(predictionTest) print("Area Under ROC using Logistics Regression on test data =" + str(areaTest)) rmseLR = evaluatorRegression.evaluate(predictionTest)
def train_model(df_orig, maxDepth, numTrees, stringIndexer, vecAssembler, train_on):
    from pyspark.sql.functions import col
    import mlflow
    import mlflow.spark
    import pandas as pd
    from pyspark.ml import Pipeline
    from pyspark.ml.regression import RandomForestRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from mlflow.tracking import MlflowClient

    # Note: airport_code, training_start_date, and training_end_date are
    # expected to be defined in the surrounding notebook scope.

    # Filter the source dataframe to the given airport and the training date
    # range; this filtered dataset is what we train, test, and measure on.
    df = None
    if train_on == "ARR":
        df = (df_orig.filter(df_orig.DEST == airport_code)
                     .filter(col("SCHEDULED_ARR_TIME")
                             .between(pd.to_datetime(training_start_date),
                                      pd.to_datetime(training_end_date))))
    elif train_on == "DEP":
        cols_to_drop_if_departure = ["TAXI_OUT", "WHEELS_OFF", "WHEELS_ON", "TAXI_IN",
                                     "ARR_DELAY", "DIVERTED", "CRS_ELAPSED_TIME",
                                     "ACTUAL_ELAPSED_TIME", "AIR_TIME"]
        df = (df_orig.filter(df_orig.ORIGIN == airport_code)
                     .filter(col("SCHEDULED_DEP_TIME")
                             .between(pd.to_datetime(training_start_date),
                                      pd.to_datetime(training_end_date)))
                     .drop(*cols_to_drop_if_departure))

    (trainDF, testDF) = df.randomSplit([0.8, 0.2], seed=42)

    with mlflow.start_run(run_name="flights-randomforest-with-regressors-{0}_arr_del".format(airport_code)) as run:
        rf = None
        if train_on == "ARR":
            rf = RandomForestRegressor(featuresCol="features", labelCol="ARR_DELAY",
                                       maxDepth=maxDepth, numTrees=numTrees)
        elif train_on == "DEP":
            rf = RandomForestRegressor(featuresCol="features", labelCol="DEP_DELAY",
                                       maxDepth=maxDepth, numTrees=numTrees)

        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])
        mlflow.log_param("num_trees", rf.getNumTrees())
        mlflow.log_param("max_depth", rf.getMaxDepth())

        # Log model. Fitting the pipeline is where the training dataset is
        # string-indexed and vectorized before the forest is trained.
        pipelineModel = pipeline.fit(trainDF)
        mlflow.spark.log_model(pipelineModel, "{0}_rfr_{1}".format(airport_code, train_on))

        tags = {"training_start_date": training_start_date,
                "training_end_date": training_end_date}
        mlflow.set_tags(tags)

        # Log metrics: RMSE and R2
        predDF = pipelineModel.transform(testDF)
        regressionEvaluator = None
        if train_on == "ARR":
            regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY")
        elif train_on == "DEP":
            regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="DEP_DELAY")
        rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
        r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Look up the most recent run and register its model.
        client = MlflowClient()
        runs = client.search_runs([run.info.experiment_id],
                                  order_by=["attributes.start_time desc"],
                                  max_results=1)
        runID = runs[0].info.run_id
        model_name = "rfr_{0}_{1}_{2}_{3}".format(airport_code, training_start_date,
                                                  training_end_date, train_on)
        model_uri = "runs:/{run_id}/{code}_rfr_{arr_del}".format(run_id=runID,
                                                                 code=airport_code,
                                                                 arr_del=train_on)
        print("model_name =", model_name, "model_uri =", model_uri)
        model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
        print("REGISTERED")
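# A minimal sketch of calling the helper above. The indexer and assembler
# stages, plus the airport_code / training date globals it references, are
# assumed to be defined earlier in the notebook; the hyperparameter values
# here are illustrative only:
train_model(df_orig, maxDepth=5, numTrees=100,
            stringIndexer=stringIndexer, vecAssembler=vecAssembler,
            train_on="ARR")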
print("결정계수(R2):%d" % model.summary.r2) d13 = model.transform(test) d13.cache() d13.select("weight", "predic_weight").show(5, False) evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight") # root mean squared error rmse = evaluator.evaluate(d13) # mean squared error mse = evaluator.setMetricName("mse").evaluate(d13) # R2 metric r2 = evaluator.setMetricName("r2").evaluate(d13) # mean absolute error mae = evaluator.setMetricName("mae").evaluate(d13) print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae)) # 파이프라인 pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr]) samples2 = df9.randomSplit([0.7, 0.3]) training2 = samples2[0] test2 = samples2[1]
# COMMAND ----------

# MAGIC %md
# MAGIC ## Evaluate
# MAGIC
# MAGIC Next, we'll use `RegressionEvaluator` to assess the results. The default regression metric is RMSE.
# MAGIC
# MAGIC For more information, see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator" target="_blank">RegressionEvaluator</a>

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator().setLabelCol("cnt")

rmse = evaluator.evaluate(predictionsDF)
r2 = evaluator.setMetricName("r2").evaluate(predictionsDF)
mse = evaluator.setMetricName("mse").evaluate(predictionsDF)
mae = evaluator.setMetricName("mae").evaluate(predictionsDF)

print("Test RMSE = %f" % rmse)
print("R^2 = %f" % r2)
print("MSE = %f" % mse)
print("MAE = %f" % mae)

# COMMAND ----------
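# MAGIC %md
# MAGIC For reference (definitions, not notebook output), the four metrics printed above are
# MAGIC
# MAGIC \\( \mathrm{RMSE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2} \\),
# MAGIC \\( \mathrm{MSE} = \mathrm{RMSE}^2 \\),
# MAGIC \\( \mathrm{MAE} = \tfrac{1}{n}\sum_{i=1}^{n}\lvert y_i-\hat{y}_i \rvert \\), and
# MAGIC \\( R^2 = 1 - \frac{\sum_i (y_i-\hat{y}_i)^2}{\sum_i (y_i-\bar{y})^2} \\).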
# Import the `RegressionEvaluator` class from the `pyspark.ml.evaluation` module:
from pyspark.ml.evaluation import RegressionEvaluator

# Create an instance of the `RegressionEvaluator` class:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="duration", metricName="r2")

# Call the `explainParams` method to see other metrics:
print(evaluator.explainParams())

# Use the `evaluate` method to compute the metric on the `predictions` DataFrame:
evaluator.evaluate(predictions)

# Use the `setMetricName` method to change the metric:
evaluator.setMetricName("rmse").evaluate(predictions)

# **Note:** You can also use the `evaluate` method of the `LinearRegressionModel` class.


# ## Plot the linear regression model

import matplotlib.pyplot as plt

def plot_lr_model():
    pdf = predictions.sample(withReplacement=False, fraction=0.1, seed=34512).toPandas()
    plt.scatter("distance", "duration", data=pdf)
    plt.plot("distance", "prediction", color="black", data=pdf)
    plt.xlabel("Distance (m)")
    plt.ylabel("Duration (s)")
    plt.title("Linear Regression Model")
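# A minimal sketch of rendering the plot defined above; plt.show() is needed
# when running outside a notebook environment:
plot_lr_model()
plt.show()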