Example #1
# cache the train/test splits: the 85% slice trains the model, the 15% slice tests it
testSetDF = split15DF.cache()
trainSetDF = split85DF.cache()

###########################
# model building in spark #
###########################
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# model constructor 
lr = LinearRegression()

# inspect the model parameters; each one is set with its own setter, as below
print(lr.explainParams())

# rename the prediction column and bind the label to the "PE" column in the df
lr.setPredictionCol("Prediction_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.15)

###########################
# create a pipeline
# - a pipeline is a series of stages executed in order
# - each stage is either an estimator or a transformer
# - pipeline.fit() walks the stages: it calls estimator.fit() on
#   estimator stages and transformer.transform() on transformer stages
# - the fitted result is a PipelineModel
###########################

lrPipeline = Pipeline()
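# A minimal sketch of completing this pipeline, assuming trainSetDF already
# carries an assembled "features" vector column (otherwise a VectorAssembler
# stage would come first, as in the later examples):
lrPipeline.setStages([lr])
lrModel = lrPipeline.fit(trainSetDF)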
Example #2
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

df_featured = df_featured.select(
    col("dem").alias("label"),
    col("ts"), col("id"), col("hour"), col("weekday"),
    col("pro_lag1"), col("pre_lag1"), col("pro"), col("pre")
).filter(col("pro_lag1") > 0)
df_featured.printSchema()



training_seti = df_featured.select(col("pro_lag1"), col("pre_lag1"), col("hour"), col("ts"), col("label"))

vectorizer = VectorAssembler()
vectorizer.setInputCols(["pro_lag1", "pre_lag1", "hour"])
vectorizer.setOutputCol("features")

# Let's initialize our linear regression learner
lr = LinearRegression()

lr.setPredictionCol("prediction")\
  .setMaxIter(100)\
  .setRegParam(0.1)

# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer,lr])

lrModel = lrPipeline.fit(training_seti)

predicted_df = lrModel.transform(training_seti)
# display(predicted_df)

# Build the forecast input by shifting: the current "pro"/"pre" values play
# the role of the lag-1 features for the next time step
test_seti = df_featured.select(col("pro").alias("pro_lag1"),
                               col("pre").alias("pre_lag1"),
                               col("hour"), col("ts"))
predicted_test_df = lrModel.transform(test_seti)
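# A minimal sketch of scoring the training-set fit with RMSE; the evaluator
# and the "label"/"prediction" column names match the code above
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                metricName="rmse")
print("Training RMSE: {:.3f}".format(evaluator.evaluate(predicted_df)))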
Example #3
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Converts the list of columns into a single vector column
vectorizer = VectorAssembler()
vectorizer.setInputCols(['Atmospheric_Temperature', 'Vacuum_Speed', 'Atmospheric_Pressure', 'Relative_Humidity'])
vectorizer.setOutputCol('features')

# split the dataset into test (20%) and training (80%) sets
seed = 1800009193
(testSetDF, trainSetDF) = raw_data_df.randomSplit([0.2, 0.8], seed=seed)
testSetDF.cache()
trainSetDF.cache()

# Create a Linear Regression Model
lr = LinearRegression()
# print(lr.explainParams())
lr.setPredictionCol('Predicted_PE').setLabelCol('Power_Output').setMaxIter(100).setRegParam(0.1)

# Create a ML Pipeline and set the stages
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])

# Train the model with training dataset
lrModel = lrPipeline.fit(trainSetDF)

# Get the intercept and coefficients of the fitted equation;
# stages[1] is the fitted LinearRegressionModel (stage 0 is the assembler)
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Get the list of feature column names (everything except the label)
features = [c for c in trainSetDF.columns if c != "Power_Output"]
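# A minimal sketch: render the fitted equation as a string, assuming
# `features` lines up with the VectorAssembler input order above
equation = "Power_Output = {:.2f}".format(intercept)
for w, f in zip(weights, features):
    equation += " + ({:.2f} * {})".format(w, f)
print(equation)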
Example #4
# ***** LINEAR REGRESSION MODEL *****

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Let's initialize our linear regression learner
lr = LinearRegression()

# explainParams dumps the full list of parameters we can set
print(lr.explainParams())

# Now we set the parameters for the method
lr.setPredictionCol("Predicted_PE")\
  .setLabelCol("PE")\
  .setMaxIter(100)\
  .setRegParam(0.1)


# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()

lrPipeline.setStages([vectorizer, lr])
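# (vectorizer is assumed to be the VectorAssembler built in an earlier
#  cell, as in Example #3)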

# Let's first train on the entire dataset to see what we get
lrModel = lrPipeline.fit(trainingSetDF)

# The intercept is as follows:
intercept = lrModel.stages[1].intercept

# The coefficients (i.e., weights) are as follows:
weights = lrModel.stages[1].coefficients

Example #5

# See which parameters are available
print(lr.explainParams())


# Two parameters must be set explicitly:
# - the label column, "PE" (the known values to learn from)
# - the prediction column, "Predicted_PE" (where the predicted values are stored)

lr.setPredictionCol("Predicted_PE").setLabelCol("PE")


# We will also configure two parameters that are customary for linear
# regression:
# - the maximum number of iterations: 100
# - the regularization parameter: 0.1

lr.setMaxIter(100).setRegParam(0.1)


# ## Part 8: Create a pipeline
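# The snippet ends at this heading; a minimal sketch of the pipeline step it
# introduces, following the pattern of the other examples here (assumes the
# same `vectorizer` and `trainingSetDF` used above):
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
lrModel = lrPipeline.fit(trainingSetDF)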
Example #6
# COMMAND ----------

# ***** LINEAR REGRESSION MODEL *****

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Let's initialize our linear regression learner
lr = LinearRegression()

# COMMAND ----------

# Now we set the parameters for the method
lr.setPredictionCol("predicted_meter_reading")\
  .setLabelCol("meter_reading")\
  .setMaxIter(100)\
  .setRegParam(0.15)


# We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar.
lrPipeline = Pipeline()

lrPipeline.setStages([vectorizer, lr])

# Let's first train on the entire dataset to see what we get
lrModel = lrPipeline.fit(trainingSetDF)


# COMMAND ----------

# The intercept is as follows:
intercept = lrModel.stages[1].intercept
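# The coefficients come from the same fitted stage; a minimal sketch,
# mirroring the earlier examples:
weights = lrModel.stages[1].coefficients
print(intercept, weights)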