def train_and_pred(train, test_data, tech_only=False):
    """Fit an L1-regularised linear regression and score it per company.

    Args:
        train: Spark DataFrame with a ``scaledFeatures`` vector column and
            the ``TARGET`` label column.
        test_data: Mapping of company key to that company's test DataFrame.
            Keys must support ``.upper()`` (they are used in log output).
        tech_only: Value copied verbatim into the ``Tech_Only_Pred`` column
            of every returned frame.

    Returns:
        dict mapping each company key to a pandas DataFrame holding the
        model predictions (``scaledFeatures`` dropped) plus the columns
        ``Instrument``, ``Error_Pct``, ``Tech_Only_Pred`` and ``row_num``.
    """
    # Train the linear regression model.
    lr_model = LinearRegression(
        featuresCol='scaledFeatures',
        labelCol=TARGET,
        maxIter=300,
        regParam=1,
        elasticNetParam=1,
    ).fit(train)
    print('Coefficients: {}'.format(str(lr_model.coefficients)))
    print('Intercept: {}'.format(str(lr_model.intercept)))

    # Summarize the training fit.
    training_summary = lr_model.summary
    print('Training r2 = {}'.format(float(training_summary.r2)))
    print('Training RMSE = {}\n'.format(
        float(training_summary.rootMeanSquaredError)))

    # The evaluator does not depend on the company, so build it once
    # instead of once per loop iteration.
    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol=TARGET, metricName='r2')

    predictions_dict = {}
    for company in test_data:
        test_company_data = test_data[company]
        lr_predictions = lr_model.transform(test_company_data)

        # Model evaluation on this company's test set.
        test_r2 = r2_evaluator.evaluate(lr_predictions)
        print('{}, testing r2 = {}'.format(company.upper(), test_r2))
        test_result = lr_model.evaluate(test_company_data)
        print('{}, testing RMSE = {}\n'.format(
            company.upper(), test_result.rootMeanSquaredError))

        # Annotate the predictions before collecting them to pandas.
        new_df = lr_predictions.drop('scaledFeatures').withColumn(
            'Instrument', lit(company))
        new_df = new_df.withColumn(
            'Error_Pct', error_pct_udf(array(TARGET, 'prediction')))
        new_df = new_df.withColumn('Tech_Only_Pred', lit(tech_only))

        # Expose the pandas positional index as an explicit column.
        predictions_dict[company] = new_df.toPandas().reset_index().rename(
            columns={'index': 'row_num'})

    return predictions_dict
# In the previous exercise you added more predictors to the flight duration
# model. The model performed well on testing data, but with so many
# coefficients it was difficult to interpret.
#
# In this exercise you'll use Lasso regression (regularized with an L1
# penalty) to create a more parsimonious model. Many of the coefficients in
# the resulting model will be set to zero. This means that only a subset of
# the predictors actually contribute to the model. Despite the simpler model,
# it still produces a good RMSE on the testing data.
#
# You'll use a specific value for the regularization strength. Later you'll
# learn how to find the best value using cross validation.
#
# The data (same as previous exercise) are available as flights, randomly
# split into flights_train and flights_test.
#
# Instructions
# 100 XP
#   - Fit a linear regression model to the training data.
#   - Calculate the RMSE on the testing data.
#   - Look at the model coefficients.
#   - Get the count of coefficients equal to 0.

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (α = 1) to training data.
regression = LinearRegression(
    labelCol='duration',
    regParam=1,
    elasticNetParam=1,
).fit(flights_train)

# Calculate the RMSE on testing data.
rmse = RegressionEvaluator(labelCol='duration').evaluate(
    regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients.
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients.
zero_coeff = sum(beta == 0 for beta in regression.coefficients)
print("Number of ceofficients equal to 0:", zero_coeff)
valid_finalized_data.show()

# 80/20 split into training and test sets.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

# The label is whichever column sits at index 11 of the raw dataset.
regressor = LinearRegression(
    featuresCol='Attributes',
    labelCol=dataset.columns[11],
)

# Train the model on the training split.
regressor = regressor.fit(train_data)

# Score the held-out split and display its predictions.
pred = regressor.evaluate(test_data)
pred.predictions.show()

# Predict on the validation set.
predictions = regressor.transform(valid_finalized_data)
predictions.show()

dataset.groupby("quality").count().show()

# ############################################################################
# Export the trained model and create a zip file for ease of download.
import shutil
from pyspark.ml.regression import LinearRegressionModel

regressor.write().overwrite().save("cs643")
path_drv = shutil.make_archive("cs643", format='zip', base_dir="cs643")

# NOTE(review): the following call is cut off in the visible source; its
# remaining arguments are unknown, so it is preserved verbatim rather than
# completed by guess.
# shutil.unpack_archive( "cs643.zip", "test", format='zip',
# dow (departure day of week, one-hot encoded, 7 levels) and
# mon (departure month, one-hot encoded, 12 levels).
#
# These have been assembled into the features column, which is a sparse
# representation of 32 columns (remember one-hot encoding produces a number
# of columns which is one fewer than the number of levels).
#
# The data are available as flights, randomly split into flights_train and
# flights_test. The object predictions is also available.
#
# Instructions
# 100 XP
#   - Fit a linear regression model to the training data.
#   - Generate predictions for the testing data.
#   - Calculate the RMSE on the testing data.
#   - Look at the model coefficients. Are any of them zero?

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit linear regression model to training data.
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Make predictions on testing data.
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data.
rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients.
coeffs = regression.coefficients
print(coeffs)
# Gather the predictor columns into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'weight_kg',
        'cyl',
        'type_dummy',
        'density',
        'density_area',
        'density_volume',
    ],
    outputCol='features',
)
cars = assembler.transform(cars)
kars = cars.select('consumption', 'features')
print(kars.toPandas().sample(12))

# Split the data into training and testing sets.
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Plain (unregularised by argument) linear regression on the training set.
regression = LinearRegression(labelCol='consumption').fit(kars_train)

# Create predictions for the testing data.
predictions = regression.transform(kars_test)
print("\nStandard Linear Regression")
#print("\nStandard Linear Regression\nSample")
#print(predictions.toPandas().sample(12))

# Print the coefficients and the training-set RMSE.
trainingSummary = regression.summary
print("Coefficients: %s" % str(regression.coefficients))
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

# Ridge regression: elasticNetParam=0 with regParam=0.1.
ridge = LinearRegression(
    labelCol='consumption',
    elasticNetParam=0,
    regParam=0.1,
).fit(kars_train)

# Create predictions for the testing data.
predictions = ridge.transform(kars_test)
print("\nRidge Regression")
#print("\nRidge Regression\nSample")
# Create an instance of the one hot encoder.
# NOTE(review): OneHotEncoderEstimator exists only in some Spark releases;
# confirm it matches the Spark version this file targets.
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data.
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results.
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

# Regression
from pyspark.ml.regression import LinearRegression

regression = LinearRegression(labelCol='consumption')
regression = regression.fit(cars_train)
predictions = regression.transform(cars_test)

# Calculate RMSE
from pyspark.ml.evaluation import RegressionEvaluator

# Find RMSE (the evaluator's default metric) on the test predictions.
# The DataFrame created above is named `predictions`; evaluating the
# undefined name `prediction` would raise NameError.
RegressionEvaluator(labelCol='consumption').evaluate(predictions)
# Other metrics: mae, r2, mse

# Examine intercept
print(regression.intercept)

# Examine coefficients
print(regression.coefficients)

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
# Consolidate predictor columns into the assembler's output column.
flights_assembled = assembler.transform(flites)

# Keep only the assembled features and the label.
flites = flights_assembled.select('features', 'duration')

# Split the data into training and testing sets.
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)
#print(flights_train.toPandas().shape, flights_test.toPandas().shape)

# Create a lasso regression object (elasticNetParam=1) and train it on the
# training data.
lasso = LinearRegression(
    labelCol="duration",
    elasticNetParam=1,
    regParam=1,
).fit(flights_train)

# Create predictions for the testing data and take a look at a sample.
predictions = lasso.transform(flights_test)
#predictions.select('duration', 'prediction').show(truncate=False)
print("\nLasso Regression")
print(predictions.toPandas().sample(12))

# Calculate the RMSE on the test predictions.
print(
    "\nRMSE",
    RegressionEvaluator(labelCol="duration").evaluate(predictions),
)

# Print the coefficients and intercept of the fitted model.
print("\nCoefficients: %s" % str(lasso.coefficients))
print("Intercept: %s" % str(lasso.intercept))

# Training-set summary, kept for the optional diagnostics below.
trainingSummary = lasso.summary
#print("numIterations: %d" % trainingSummary.totalIterations)
#print("objectiveHistory: %s\n" % str(trainingSummary.objectiveHistory))
accuracy = evalVal.evaluate(predictions)
##print("f1 Test Error = %g" % (1.0 - accuracy))

# Round the model's predictions on the validation set to whole numbers.
transformed_data = model.transform(val)
transformed_data = transformed_data.withColumn(
    "prediction", func.round("prediction"))
##print(evalVal.getMetricName(), 'accuracy :', evalVal.evaluate(transformed_data))


# In[123]:


####### Linear Regression Accuracy and f1 ############

# Create evaluator comparing the "quality" label with the rounded prediction.
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="quality",
    predictionCol="prediction",
)

# Make predictions, then round them so they can be compared as class labels.
predictionAndTarget = regressor.transform(val).select("quality", "prediction")
predictionAndTarget = predictionAndTarget.withColumn(
    "prediction", func.round("prediction"))

# Get metrics by overriding the evaluator's metric name per call.
acc = evaluatorMulti.evaluate(
    predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(
    predictionAndTarget, {evaluatorMulti.metricName: "f1"})


# In[124]:


print("##### Testdataset Accuracy #####")
print("Accuracy :", acc * 100, "%")
print("f1 Score :", f1)
# Consolidate predictor columns into the assembler's output column.
kars_assembled = assembler.transform(kars)

# Keep only the assembled features and the label, then inspect a few rows.
kars = kars_assembled.select('features', 'consumption')
kars.distinct().show(8, truncate=False)

# Split the data into training and testing sets.
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

# Fit a linear regression model to the training data.
regression = LinearRegression(labelCol='consumption')
regression = regression.fit(kars_train)

# Make predictions on the testing data.
prediction = regression.transform(kars_test)

# Count rows per (actual consumption, predicted value) pair.
prediction.groupBy("consumption", 'prediction').count().show(8)

# Print the coefficients and intercept of the fitted model.
print("Coefficients: %s" % str(regression.coefficients))
print("Intercept: %s" % str(regression.intercept))

# Summarize the model over the training set and print out some metrics.
trainingSummary = regression.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show(8)
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
# Load the stock CSV, letting Spark infer column types from the data.
dataset = spark.read.csv(
    "C:/Users/bansi/spark-3.0.1-bin-hadoop2.7/Stock Data Google.csv",
    inferSchema=True,
    header=True)

# Assemble the predictor columns into one vector column.
featureassembler = VectorAssembler(
    inputCols=["Open", "High", "Low", "Volume"],
    outputCol="Independent Features")
output = featureassembler.transform(dataset)
finalized_data = output.select("Independent Features", "Close")

# 75/25 train/test split. No seed is passed, so the split differs per run.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

# Fit the regression of Close on the assembled features.
regressor = LinearRegression(
    featuresCol='Independent Features', labelCol='Close')
regressor = regressor.fit(train_data)

predictions = regressor.transform(test_data)
#predictions.show()

# r2 comes from the evaluator; the error metrics come from the model's own
# evaluation summary on the test set.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction", labelCol="Close", metricName="r2")
test_result = regressor.evaluate(test_data)

print("R Squared (R2) = %g" % lr_evaluator.evaluate(predictions))
print("Root Mean Squared Error (RMSE) = %g" % test_result.rootMeanSquaredError)
print("Mean Absolute Error = %g" % test_result.meanAbsoluteError)
print("Mean Squared Error = %g" % test_result.meanSquaredError)

# Collect label and prediction together in one pass: `predictions` still
# carries the Close column, so the two lists are guaranteed to be row-aligned
# and the data is brought to the driver only once.
scored = predictions.select("Close", "prediction").toPandas()
actual = scored['Close'].values.tolist()
predicted = scored['prediction'].values.tolist()
plt.ylabel('Beta Coefficients')
plt.show()


# In[39]:


# Notebook-style bare expression: displays the intercept in a cell, has no
# effect when run as a plain script.
regressor.intercept


# In[40]:


trainingSummary = regressor.summary
print("numIterations: %d" % trainingSummary.totalIterations)


# In[46]:


# Assemble features for the test frame and score it with the fitted model.
test_data = featureassembler.transform(df_test)
rest = regressor.transform(test_data)
df = rest.toPandas()
rest.select("type", "type_index", "prediction").show(6)


# In[42]:


from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The regression output column 'prediction' is used as the raw score.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='prediction',
    labelCol='type_index',
)


# In[44]:


print("The area under ROC for test set is {}".format(evaluator.evaluate(rest)))


# In[ ]:
The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test. Instructions 100 XP Fit a linear regression model to the training data. Calculate the RMSE on the testing data. Look at the model coefficients. Get the count of coefficients equal to 0. ''' SOLUTION from pyspark.ml.regression import LinearRegression from pyspark.ml.evaluation import RegressionEvaluator # Fit Lasso model (α = 1) to training data regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train) # Calculate the RMSE on testing data rmse = RegressionEvaluator(labelCol='duration').evaluate( regression.transform(flights_test)) print("The test RMSE is", rmse) # Look at the model coefficients coeffs = regression.coefficients print(coeffs) # Number of zero coefficients zero_coeff = sum([beta == 0 for beta in regression.coefficients]) print("Number of ceofficients equal to 0:", zero_coeff)
# Assemble every column of `train` not listed in `ignore` into 'features'.
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features',
)

train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (a linear regression with an elastic-net mix).
model_regresion = LinearRegression(
    maxIter=50,
    regParam=0.05,
    elasticNetParam=0.05,
)

# Fit the model.
model_regresion = model_regresion.fit(train_LP)

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Fit the model
# lrModel = lr.fit(training)

# Print the coefficients and intercept of the fitted model.
print("Coefficients: %s" % str(model_regresion.coefficients))
print("Intercept: %s" % str(model_regresion.intercept))

# Summarize the model over the training set and print out some metrics.
trainingSummary = model_regresion.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
# Consolidate predictor columns into the assembler's output column.
flights_assembled = assembler.transform(flites)

# Keep only the assembled features and the label.
flites = flights_assembled.select('features', 'duration')

# Split the data into training and testing sets.
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)
#print(flights_train.toPandas().shape, flights_test.toPandas().shape)

# Create a ridge regression object (elasticNetParam=0) and train it on the
# training data.
ridge = LinearRegression(
    labelCol="duration",
    elasticNetParam=0,
    regParam=0.1,
).fit(flights_train)

# Create predictions for the testing data and take a look at a sample.
predictions = ridge.transform(flights_test)
#predictions.select('duration', 'prediction').show(truncate=False)
print("\nRidge Regression")
print(predictions.toPandas().sample(12))

# Calculate the RMSE on the test predictions.
print(
    "\nRMSE",
    RegressionEvaluator(labelCol="duration").evaluate(predictions),
)

# Print the coefficients and intercept of the fitted model.
print("\nCoefficients: %s" % str(ridge.coefficients))
print("Intercept: %s" % str(ridge.intercept))

# Training-set summary, kept for the optional diagnostics below.
trainingSummary = ridge.summary
#print("numIterations: %d" % trainingSummary.totalIterations)
#print("objectiveHistory: %s\n" % str(trainingSummary.objectiveHistory))
# Plot on a Google Map using Bokeh (include Maps.py for the plot method)
# execfile('Maps.py')
# plot(lats, lons, count)

# Tuple-unpacking lambda parameters (``lambda (k, x): ...``) are a
# SyntaxError on Python 3 (PEP 3113), so records are indexed explicitly.
# Record layouts, as implied by the original unpacking patterns:
#   post:  (k, (c1, c2))
#   valid: (k, (c0, c1), (y, m, d), p, s)
#   dist:  (p, d)

# Reference pair for key 'W1J 7NT': the [c1, c2] lists of all matching
# records are concatenated, and l[0], l[1] are used below.
l = post.filter(lambda rec: rec[0] == 'W1J 7NT').map(
    lambda rec: [rec[1][0], rec[1][1]]).reduce(lambda x, y: x + y)

# Keep only records whose y component equals 2010.
p10 = valid.filter(lambda rec: rec[2][0] == 2010)

# Pair each record's p with distance(c0, c1, l[0], l[1]).
dist = p10.map(
    lambda rec: (rec[3], distance(rec[1][0], rec[1][1], l[0], l[1])))

# Spearman correlation between p and the computed distance.
vectors = dist.map(lambda pd: Vectors.dense([pd[0], pd[1]]))
print(Statistics.corr(vectors, method='spearman'))

# Regress p (label) on the distance (single feature).
parsedData = dist.map(
    lambda pd: LabeledPoint(float(pd[0]), Vectors.dense(pd[1])))
# Convert once and reuse the DataFrame for both fitting and prediction.
parsed_df = parsedData.toDF()
model = LinearRegression(
    maxIter=10, regParam=0.3, elasticNetParam=0.8).fit(parsed_df)
Beta = model.coefficients
intercept = model.intercept

# Rescale the coefficient by the ratio of sample standard deviations.
x = dist.map(lambda pd: Vectors.dense(pd[1]))
y = dist.map(lambda pd: pd[0])
sd_y = y.sampleStdev()
sd_x = x.sampleStdev()
r = Beta / (sd_y / sd_x)

predict = model.transform(parsed_df)
evaluator = RegressionEvaluator(metricName='rmse')
RMSE = evaluator.evaluate(predict)
# 226861.44751570973  (presumably the RMSE observed in an earlier run)