# Example #1
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, lag, lit, monotonically_increasing_id
from pyspark.sql.window import Window


def fitVar(p_lag, dataFrame, model_path=None):
    """Fit a VAR(p_lag) model as one MLlib linear regression per series.

    If model_path is given, previously saved PipelineModels are loaded
    instead of refitting. Returns a DataFrame of joined predictions with
    error metrics attached.
    """

    # x_list: every column, including the 'TimeStamp' index column; the
    # number of model variables K excludes 'TimeStamp'.
    x_list = dataFrame.columns
    # The VAR regression type is "const" (an intercept term), matching the R
    # VAR library and the default in the Python VAR library.
    # A single global window ordered by time, so lag() sees the full series.
    w = Window().partitionBy().orderBy(col("TimeStamp"))
    df_len = len(dataFrame.columns)
    ys_lagged_list = ["const"]
    # Build the lagged regressor columns: one per variable per lag,
    # skipping the 'TimeStamp' index column.
    for i in range(1, p_lag + 1):
        for j in range(0, df_len):
            if x_list[j] != 'TimeStamp':
                ys_lagged_list.append("%st-%s" % (x_list[j], str(i)))
                dataFrame = dataFrame.withColumn("%st-%s" % (x_list[j], str(i)),
                                                 lag(dataFrame[j], i, 0).over(w))

    # add "const" column of value 1 to get intercept when fitting the regression model
    dataFrame = dataFrame.withColumn("const", lit(1))
    dataFrame = dataFrame.withColumn("const", lag("const", p_lag, 0).over(w))
    dataFrame = dataFrame.withColumn("rid", monotonically_increasing_id())
    dataFrame = dataFrame.filter(dataFrame.rid >= p_lag)
    # dataFrame.show(5)
    #     build ys_lagged dataframe, will be used in F-test
    ys_lagged = dataFrame.select(ys_lagged_list)
    ys_lagged_len = ys_lagged.count()
    # print('ye dikhai lagged value')
    # ys_lagged.show(10)

    dataFrame = dataFrame.drop('rid')
    dataFrame = dataFrame.drop('const')
    # Feature columns: everything except the original (unlagged) series
    input_feature_name = dataFrame.schema.names
    for x_name in x_list:
        input_feature_name.remove(x_name)

    # assemble the vector for MLlib linear regression
    assembler_for_lag = VectorAssembler(
        inputCols=input_feature_name,
        outputCol="features")

    lrModels = []
    evaluator = RegressionEvaluator()
    models = {}
    predictions = {}
    # Fit one regression per variable: each y is regressed on all lagged columns
    for select_y in x_list:
        if select_y != 'TimeStamp':
            model_key = select_y
            if model_path is None:
                # No saved model supplied: train a fresh pipeline for this
                # variable on the current micro-batch.
                lr = LinearRegression(featuresCol='features', labelCol=select_y,
                                      maxIter=1000, fitIntercept=True)
                pipeline = Pipeline(stages=[assembler_for_lag, lr])
                model_val = pipeline.fit(dataFrame)
            else:
                # Reuse the previously saved pipeline for this variable
                model_val = PipelineModel.load(model_path + select_y)

            # Score the model on the training frame; transform once and reuse
            prediction = model_val.transform(dataFrame)
            evaluator.setLabelCol(select_y)
            evaluator.setPredictionCol("prediction")
            evaluator.setMetricName('rmse')  # root mean squared error
            rmse = evaluator.evaluate(prediction)
            evaluator.setMetricName('mse')   # mean squared error
            mse = evaluator.evaluate(prediction)
            evaluator.setMetricName('mae')   # mean absolute error
            mae = evaluator.evaluate(prediction)

            models[model_key] = model_val
            predictions[model_key] = prediction
            lrModels.append(model_val)

    # Join the per-variable predictions on TimeStamp. Note: the function
    # assumes the input contains the 'RT_Temp' and 'Nu_Temp' series.
    df_RT_Temp = predictions['RT_Temp']
    df_Nu_Temp = predictions['Nu_Temp']
    df_final = (
        df_RT_Temp.alias('dr').join(df_Nu_Temp.alias('dn'),
                                    on=df_RT_Temp['TimeStamp'] == df_Nu_Temp['TimeStamp'],
                                    how='inner').selectExpr('dr.TimeStamp as TS',
                                                            'dr.RT_Temp',
                                                            'dr.prediction as RT_Temp_Predict',
                                                            'dn.Nu_Temp as Nu_Temp',
                                                            'dn.prediction as NU_Temp_Predict')
    )
    # Attach the error metrics; these hold the values from the last model
    # fitted in the loop above.
    df_final = df_final.withColumn("MAE_Score", lit(mae))
    df_final = df_final.withColumn("MSE_Score", lit(mse))
    df_final = df_final.withColumn("RMSE_Score", lit(rmse))
    return df_final
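
# COMMAND ----------

# MAGIC %md A minimal usage sketch for `fitVar` (illustrative, not part of the original
# MAGIC notebook): it builds a small in-memory frame with the 'TimeStamp', 'RT_Temp',
# MAGIC and 'Nu_Temp' columns the function expects and fits a lag-2 VAR. The sample
# MAGIC values are made up.

# COMMAND ----------

# Hypothetical toy series; any DataFrame with these three columns would do
sample_rows = [(t, 20.0 + 0.1 * t, 30.0 + 0.2 * t) for t in range(1, 21)]
sample_df = spark.createDataFrame(sample_rows, ["TimeStamp", "RT_Temp", "Nu_Temp"])

var_out = fitVar(2, sample_df)  # pass model_path=... instead to reuse saved pipelines
var_out.show(5)

# COMMAND ----------
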
# MAGIC %md Now that we have a model trained on all of the data, let's try to build a better model by tuning over several parameters.
# MAGIC
# MAGIC Documentation available here: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.tuning

# COMMAND ----------

# Candidate regularization strengths: 0.01 through 0.10
regParam = [i / 100.0 for i in range(1, 11)]
regParam

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the pipeline's Predicted_PE output against the PE label by RMSE
regEval = RegressionEvaluator(predictionCol="Predicted_PE")
regEval.setLabelCol("PE")\
  .setMetricName("rmse")

# Grid over regParam only; `lr`, `lrPipeline`, and `trainingSet` are defined
# in earlier cells of this notebook.
grid = ParamGridBuilder().addGrid(lr.regParam, regParam).build()

crossval = CrossValidator(estimator=lrPipeline,
                          estimatorParamMaps=grid,
                          evaluator=regEval,
                          numFolds=5)

cvModel = crossval.fit(trainingSet)

# COMMAND ----------

# MAGIC %md Now that we have tuned, let's see what we got for tuning parameters and what our RMSE was versus our initial model
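
# COMMAND ----------

# MAGIC %md A sketch of that inspection (illustrative; assumes `lr`, `grid`, and
# MAGIC `cvModel` from the cells above). Since the evaluator metric is RMSE,
# MAGIC lower is better, so the best grid point has the smallest cross-validated
# MAGIC metric.

# COMMAND ----------

# avgMetrics holds one cross-validated RMSE per point in the parameter grid
best_idx = min(range(len(cvModel.avgMetrics)), key=lambda i: cvModel.avgMetrics[i])
print("Best regParam:", grid[best_idx][lr.regParam])
print("Cross-validated RMSE:", cvModel.avgMetrics[best_idx])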