# Example No. 1
# score: 0
# Train a random-forest regressor and report RMSE / R^2 / MAE on held-out data.
# NOTE(review): assumes `train_data`/`test_data` are DataFrames carrying the
# default "features"/"label" columns — confirm upstream assembly.
model = RandomForestRegressor(numTrees=100)
model = model.fit(train_data)

# # model evaluation

# In[ ]:

# Relative importance the forest learned for each input feature.
model.featureImportances

# In[ ]:

from pyspark.ml.evaluation import RegressionEvaluator

# In[ ]:

test_results = model.transform(test_data)

# In[ ]:

# RegressionEvaluator's default metric is RMSE. The values are printed
# explicitly: a bare `evaluator.evaluate(...)` expression only displays its
# result inside a notebook, so running this export as a plain script used to
# print the metric labels with no numbers.
evaluator = RegressionEvaluator()
print('RMSE')
print(evaluator.evaluate(test_results))

# In[ ]:

print('R_sqr')
print(evaluator.evaluate(test_results, {evaluator.metricName: "r2"}))

# In[ ]:

print('MAE')
# Fix: the original printed the 'MAE' label but never computed the metric.
print(evaluator.evaluate(test_results, {evaluator.metricName: "mae"}))
# Example No. 2
# score: 0
# Define the model algorithm (random forest regression).
# NOTE(review): assumes `train_LP`/`evaluation_LP` are DataFrames with
# "label"/"features" columns — confirm upstream preparation.
model_regresion = RandomForestRegressor(labelCol="label",
                                        featuresCol="features",
                                        maxDepth=11,
                                        maxBins=64,
                                        numTrees=10)

# Fit the model
model_regresion = model_regresion.fit(train_LP)

# Save the model
# NOTE(review): the commented-out save references `model_multiclase`, not
# `model_regresion` — verify the variable name before re-enabling.
# model_multiclase.save("dbfs:/dataset/modelo_multiclase_RF")

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Score the predictions with root-mean-squared error.
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# COMMAND ----------

# Build a single vector column ("features") from every column of `test`
# except the label.
ignore = ["label"]
assembler = VectorAssembler(
    inputCols=[x for x in test.columns if x not in ignore],
    outputCol='features')
# Example No. 3
# score: 0
# Train a random-forest pipeline that predicts earthquake Magnitude from
# Latitude, Longitude and Depth, then build a renamed prediction table.
# NOTE(review): assumes df_training/df_testing carry those four columns —
# confirm upstream.

# Drop rows with missing values so the assembler does not choke on nulls.
df_testing = df_testing.dropna()

#assembler: pack the three numeric inputs into one 'features' vector
assembler = VectorAssembler(inputCols=['Latitude', 'Longitude', 'Depth'],
                            outputCol='features')

model = RandomForestRegressor(featuresCol='features', labelCol='Magnitude')

#pipeline: assemble features, then fit the forest
pipeline = Pipeline(stages=[assembler, model])

#train_model
model = pipeline.fit(df_training)

#make prediction
pred = model.transform(df_testing)

#evaluate with root-mean-squared error
evaluator = RegressionEvaluator(labelCol='Magnitude',
                                predictionCol='prediction',
                                metricName='rmse')
rmse = evaluator.evaluate(pred)
# Fix: the RMSE was computed but never reported, leaving dead work
# (the sibling examples in this file all print it).
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

#create the prediction dataset — explicit .select() instead of the obscure
#tuple-__getitem__ form `pred['Latitude', 'Longitude', 'prediction']`
df_pred_results = pred.select('Latitude', 'Longitude', 'prediction')

#rename prediction column
df_pred_results = df_pred_results.withColumnRenamed('prediction',
                                                    'Pred_Magnitude')

#add more column to df_pred_results
# Example No. 4
# score: 0
from pyspark.ml.regression import LinearRegression

# Baseline: ordinary linear regression on the 'total' label.
# NOTE(review): assumes df_train/df_test carry 'features' and 'total'
# columns — confirm upstream.
lin_reg = LinearRegression(labelCol='total')
lr_model = lin_reg.fit(df_train)
print(lr_model.coefficients, '\n', lr_model.intercept)
train_prediction = lr_model.evaluate(df_train)
print(train_prediction.r2, train_prediction.meanAbsoluteError)

test_prediction = lr_model.evaluate(df_test)
print(test_prediction.r2, test_prediction.meanAbsoluteError)

test_prediction.predictions.show(3)

from pyspark.ml.regression import RandomForestRegressor

# Random forest on the same label, for comparison with the linear baseline.
rf_model = RandomForestRegressor(featuresCol='features',
                                 labelCol='total', numTrees=100).fit(df_train)
predictions = rf_model.transform(df_test)
predictions.show()
rf_model.featureImportances

# NOTE(review): RegressionMetrics is imported but unused below; kept in case
# later (unseen) cells rely on it.
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="total", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Fix: removed `rf_model.stages[1]`, which raised AttributeError —
# rf_model is a RandomForestRegressionModel fit directly on df_train,
# not a PipelineModel, so it has no `stages` attribute.
print(rf_model)  # summary only