def test_linear_regression(self):
    """Verify that a LinearRegression survives a save/load round trip.

    Saves an estimator built with ``maxIter=1`` under a fresh temporary
    directory, loads it back, and checks that:

    * the loaded instance's ``uid`` equals the ``parent`` of its ``maxIter``
      param, and
    * the default value recorded for ``maxIter`` is the same on the original
      and on the loaded instance.

    The temporary directory is removed afterwards, whether or not the
    checks pass.
    """
    lr = LinearRegression(maxIter=1)
    path = tempfile.mkdtemp()
    # try/finally: without it a failing save/load or assertion would skip
    # the cleanup and leave the temporary directory behind.
    try:
        lr_path = path + "/lr"
        lr.save(lr_path)
        lr2 = LinearRegression.load(lr_path)
        self.assertEqual(
            lr2.uid, lr2.maxIter.parent,
            "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
            % (lr2.uid, lr2.maxIter.parent))
        self.assertEqual(
            lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
            "Loaded LinearRegression instance default params did not match " +
            "original defaults")
    finally:
        try:
            rmtree(path)
        except OSError:
            # Cleanup is best-effort: a directory that cannot be removed
            # must not turn a passing test into a failure.
            pass
def test_linear_regression(self):
    """Verify that a LinearRegression survives a save/load round trip.

    Saves an estimator built with ``maxIter=1`` under a fresh temporary
    directory, loads it back, and checks that:

    * the loaded instance's ``uid`` equals the ``parent`` of its ``maxIter``
      param, and
    * the default value recorded for ``maxIter`` is the same on the original
      and on the loaded instance.

    The temporary directory is removed afterwards, whether or not the
    checks pass.
    """
    lr = LinearRegression(maxIter=1)
    path = tempfile.mkdtemp()
    # try/finally: without it a failing save/load or assertion would skip
    # the cleanup and leave the temporary directory behind.
    try:
        lr_path = path + "/lr"
        lr.save(lr_path)
        lr2 = LinearRegression.load(lr_path)
        self.assertEqual(
            lr2.uid, lr2.maxIter.parent,
            "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
            % (lr2.uid, lr2.maxIter.parent))
        self.assertEqual(
            lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
            "Loaded LinearRegression instance default params did not match " +
            "original defaults")
    finally:
        try:
            rmtree(path)
        except OSError:
            # Cleanup is best-effort: a directory that cannot be removed
            # must not turn a passing test into a failure.
            pass
def linear_regression():
    """Walk through fitting, querying and persisting a weighted LinearRegression.

    Steps, in order:

    1. Get or create a SparkSession.
    2. Fit a ``LinearRegression`` (normal solver, ``weightCol="weight"``) on a
       two-row DataFrame.
    3. Evaluate a handful of sanity-check expressions against the fitted
       model. Their results are discarded; the trailing ``# True`` comments
       record the expected value of each expression.
    4. Call ``setParams`` positionally to demonstrate that it is rejected.
    5. Save the estimator and the fitted model under the current working
       directory (``.//lr`` and ``.//lr_model``) and load both back.

    Returns:
        None.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
    model = lr.fit(df)

    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    abs(model.transform(test0).head().prediction - (-1.0)) < 0.001  # True
    abs(model.coefficients[0] - 1.0) < 0.001  # True
    abs(model.intercept - 0.0) < 0.001  # True

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )], ["features"])
    abs(model.transform(test1).head().prediction - 1.0) < 0.001  # True

    # setParams accepts keyword arguments only, so this positional call is
    # expected to fail with
    #   TypeError: Method setParams forces keyword arguments.
    # The error is caught here; left uncaught it would abort the function
    # and the persistence steps below would never run.
    try:
        lr.setParams("vector")
    except TypeError:
        pass

    # NOTE(review): save() is expected to fail when the target path already
    # exists, so a second run from the same directory likely needs the old
    # output removed first -- confirm against the PySpark version in use.
    temp_path = "./"
    lr_path = temp_path + "/lr"
    lr.save(lr_path)
    lr2 = LinearRegression.load(lr_path)
    lr2.getMaxIter()  # 5

    model_path = temp_path + "/lr_model"
    model.save(model_path)
    model2 = LinearRegressionModel.load(model_path)
    model.coefficients[0] == model2.coefficients[0]  # True
    model.intercept == model2.intercept  # True
    model.numFeatures
# Train a linear regression on `train_data`, score `test_data`, and persist
# the fitted model. `train_data` and `test_data` are defined outside this
# fragment.

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=100, regParam=0.3, elasticNetParam=0.8)

# Fit the data to the model
linearModel = lr.fit(train_data)

# Let's run this on our test dataset
predicted = linearModel.transform(test_data)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` and collect the pairs into a list
predictionAndLabel = predictions.zip(labels).collect()

# First 5 instances of `predictionAndLabel`
# (bare expression: presumably meant to be displayed by a notebook/REPL)
predictionAndLabel[:5]

# Save the *fitted* model. The original saved the unfitted estimator `lr`,
# which cannot be loaded back as a model.
linearModel.save("/home/hduser/lrm_model.model")

# Load the model back with the class that matches what was saved
# (a linear regression model, not a logistic regression model).
from pyspark.ml.regression import LinearRegressionModel
sameModel = LinearRegressionModel.load("/home/hduser/lrm_model.model")

# Root mean squared error from the training summary
linearModel.summary.rootMeanSquaredError
# Train a linear regression that predicts the 'Close' column, evaluate it on
# `test_data`, report training metrics, and save the fitted model.
# `train_data` and `test_data` are defined outside this fragment.

# Training the model.
# NOTE: `regressor` is first the estimator and is then rebound to the
# fitted model returned by fit().
regressor = LinearRegression(featuresCol='features', labelCol='Close')
regressor = regressor.fit(train_data)

# Finding coefficients
print(regressor.coefficients)

# Finding intercept
print(regressor.intercept)

pred_results = regressor.evaluate(test_data)
# show() prints the rows itself and returns None, so wrapping it in print()
# only emitted a stray "None" line after the table.
pred_results.predictions.show()

# NOTE: RegressionEvaluator is imported but not used in the visible code.
from pyspark.ml.evaluation import RegressionEvaluator

# Finding coefficient of determination and RMSE values
try:
    # Training summary
    trainingSummary = regressor.summary
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
except Exception as exc:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate; the cause is reported rather than hidden.
    print(" Model Test have a Problem")
    print("Cause: %r" % (exc,))

# Saving the model
regressor.save("StockPricepred_Model")
print("Succesfully Saved")

#import pickle

#Pkl_Filename = "Regressor_Model"

#with open(Pkl_Filename, 'wb') as f:
#    pickle.dump(regressor, f)
# Evaluate a fitted linear regression on the test set, then train and
# evaluate a LASSO (L1-penalised) linear regression.
# `lm_model`, `lm_model_fit`, `trainDf`, `testDf` and `sc` are defined
# outside this fragment.


def _mean_squared_error(pred_label_df):
    """Return the mean of (prediction - label)**2 over a DataFrame.

    `pred_label_df` must have the prediction in column 0 and the label in
    column 1. The sum is divided by the row count of this same DataFrame.
    """
    squared_errors = pred_label_df.rdd.map(lambda row: (row[0] - row[1]) ** 2)
    return squared_errors.reduce(lambda x, y: x + y) / pred_label_df.count()


lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = _mean_squared_error(results)
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
# Build (prediction, label) tuples by position. Going through
# asDict().values() does not guarantee that order, and RegressionMetrics
# treats the first element as the prediction and the second as the label.
predsAndLabels = sc.parallelize([(row[0], row[1]) for row in res])
metrics = RegressionMetrics(predsAndLabels)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)
print(metrics.r2)
print(metrics.explainedVariance)

# NOTE(review): `lm_model` is not defined in this fragment. If it is a
# pyspark.ml estimator, save() takes only a path (the `sc` argument belongs
# to the older mllib API) -- confirm before relying on this call.
lm_model.save(sc, "LinerRegressionModel")

# LASSO
# elasticNetParam=1.0 selects a pure L1 penalty, i.e. LASSO. The previous
# value 0.0 selects a pure L2 penalty, which is ridge regression.
lasso_model = LinearRegression(featuresCol="features", predictionCol="prediction",
                               maxIter=100, regParam=1.0, elasticNetParam=1.0,
                               tol=1e-6)
lasso_model_fit = lasso_model.fit(trainDf)

# Training-set error. The denominator is the training row count; it used to
# be `results.count()`, the row count of the *test* predictions above.
lasso_transform = lasso_model_fit.transform(trainDf)
lasso_results = lasso_transform.select(lasso_transform['prediction'],
                                       lasso_transform['label'])
lasso_MSE = _mean_squared_error(lasso_results)
print("LASSO training Mean Squared Error = " + str(lasso_MSE))

# Test-set error
lasso_transform = lasso_model_fit.transform(testDf)
lasso_results = lasso_transform.select(lasso_transform['prediction'],
                                       lasso_transform['label'])
lasso_MSE = _mean_squared_error(lasso_results)
# Evaluate a fitted linear regression on the test set, then train and
# evaluate a LASSO (L1-penalised) linear regression.
# `lm_model`, `lm_model_fit`, `trainDf`, `testDf` and `sc` are defined
# outside this fragment.


def _mean_squared_error(pred_label_df):
    """Return the mean of (prediction - label)**2 over a DataFrame.

    `pred_label_df` must have the prediction in column 0 and the label in
    column 1. The sum is divided by the row count of this same DataFrame.
    """
    squared_errors = pred_label_df.rdd.map(lambda row: (row[0] - row[1]) ** 2)
    return squared_errors.reduce(lambda x, y: x + y) / pred_label_df.count()


lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = _mean_squared_error(results)
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
# Build (prediction, label) tuples by position. Going through
# asDict().values() does not guarantee that order, and RegressionMetrics
# treats the first element as the prediction and the second as the label.
predsAndLabels = sc.parallelize([(row[0], row[1]) for row in res])
metrics = RegressionMetrics(predsAndLabels)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)
print(metrics.r2)
print(metrics.explainedVariance)

# NOTE(review): `lm_model` is not defined in this fragment. If it is a
# pyspark.ml estimator, save() takes only a path (the `sc` argument belongs
# to the older mllib API) -- confirm before relying on this call.
lm_model.save(sc, "LinerRegressionModel")

# LASSO
# elasticNetParam=1.0 selects a pure L1 penalty, i.e. LASSO. The previous
# value 0.0 selects a pure L2 penalty, which is ridge regression.
lasso_model = LinearRegression(featuresCol="features", predictionCol="prediction",
                               maxIter=100, regParam=1.0, elasticNetParam=1.0,
                               tol=1e-6)
lasso_model_fit = lasso_model.fit(trainDf)

# Training-set error. The denominator is the training row count; it used to
# be `results.count()`, the row count of the *test* predictions above.
lasso_transform = lasso_model_fit.transform(trainDf)
lasso_results = lasso_transform.select(lasso_transform['prediction'],
                                       lasso_transform['label'])
lasso_MSE = _mean_squared_error(lasso_results)
print("LASSO training Mean Squared Error = " + str(lasso_MSE))

# Test-set error
lasso_transform = lasso_model_fit.transform(testDf)
lasso_results = lasso_transform.select(lasso_transform['prediction'],
                                       lasso_transform['label'])
lasso_MSE = _mean_squared_error(lasso_results)