def train(self, rdd):
    """
    This ignores the optimizer parameter since it makes config difficult for Linear Regression.
    :return: Trained model to be passed to test.
    """
    options = self.options
    if options.loss == "l2":
        if options.reg_type in ["none", "l1", "l2"]:
            return LinearRegressionWithSGD.train(data=rdd,
                                                 iterations=options.num_iterations,
                                                 step=options.step_size,
                                                 miniBatchFraction=1.0,
                                                 regParam=options.reg_param,
                                                 regType=options.reg_type)
        elif options.reg_type == "elastic-net":  # use spark.ml
            lr = MLLinearRegression(maxIter=options.num_iterations,
                                    regParam=options.reg_param,
                                    elasticNetParam=options.elastic_net_param)
            # TODO: Do not include time for conversion to DataFrame (but this currently matches
            # the Scala tests)
            df = rdd.toDF()
            lrModel = lr.fit(df)
            return LinearRegressionModel(lrModel.weights, lrModel.intercept)
        else:
            raise Exception("GLMRegressionTest cannot run with loss = %s, reg_type = %s"
                            % (options.loss, options.reg_type))
    else:
        raise Exception("GLMRegressionTest does not recognize loss: %s" % options.loss)
def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except:
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def test_linear_regression_with_huber_loss(self):
    data_path = "data/mllib/sample_linear_regression_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lir = LinearRegression(loss="huber", epsilon=2.0)
    model = lir.fit(df)

    expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
                            1.2612, -0.333, -0.5694, -0.6311, 0.6053]
    expectedIntercept = 0.1607
    expectedScale = 9.758

    self.assertTrue(
        np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
    self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
    self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
def test_linear_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertAlmostEqual(s.r2adj, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    self.assertEqual(s.degreesOfFreedom, 1)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class LinearRegressionTrainingSummary runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.regression import LinearRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("LinearRegressionWithElasticNet").getOrCreate()

    # $example on$
    # Load training data
    training = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")

    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Fit the model
    lrModel = lr.fit(training)

    # Print the coefficients and intercept for linear regression
    print("Coefficients: " + str(lrModel.coefficients))
    print("Intercept: " + str(lrModel.intercept))
    # $example off$

    spark.stop()
df = data.toDF(colNames)
# Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
# Perhaps you're importing data from a real database. Or you are using structured streaming
# to get your data.

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create our linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Now see if we can predict values in our test data.
# Generate predictions using our linear regression model for all features in our
# test dataframe:
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
    print(prediction)
df_vector = spark.createDataFrame(input_data, ["fare", "features"])  # Create a vector dataframe

# Scale the Pclass values to make them better suited for analysis
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(df_vector)
df_scaled = scaler.transform(df_vector)

# Create train and test data for the regression model
train_data, test_data = df_scaled.randomSplit([.8, .2], seed=1234)

# Create a Linear Regression model
lr = LinearRegression(labelCol="fare", maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(train_data)

print('\n------------- Question 3 -------------')
# Print some important statistics from the regression model
print('Linear Regression model statistics for dependent Fare and independent Pclass:')
print("Coefficient(s): %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))
print("RMSE: %f" % model.summary.rootMeanSquaredError)
print("r2: %f" % model.summary.r2)
print('\n')

# Answer y = b + ax or fare = intercept + coefficient * pclass
# round output to 2 decimals
q3a = round(model.intercept + (model.coefficients[0] * 1), 2)
# Replace `df` with the new DataFrame
Dataframe = spark.createDataFrame(input_data, ["label", "features"])

from pyspark.ml.feature import StandardScaler

standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(Dataframe)
scaled_df = scaler.transform(Dataframe)

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=1234)

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)
linearModel = lr.fit(train_data)

# The original post noted "Got error here": `predictionAndLabel` was used below
# without ever being defined, so we build it by zipping predictions with labels.
predicted = linearModel.transform(test_data)
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])
predictionAndLabel = predictions.zip(labels).collect()
predictionAndLabel[:5]

linearModel.coefficients
linearModel.intercept

# Get the RMSE
linearModel.summary.rootMeanSquaredError
# The RMSE measures how much error there is between two datasets, comparing a
# predicted value with an observed or known value. The smaller the RMSE, the
# closer the predicted and observed values are.
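# Aside (a minimal sketch, not from the original post): the same RMSE can be computed
# directly on the test-set predictions with RegressionEvaluator instead of reading it
# off the training summary; `predicted` is the transformed test DataFrame from above.
from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                     metricName="rmse")
print(rmse_evaluator.evaluate(predicted))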
def ridgeRegression(self, dataset_add, feature_colm, label_colm, relation_list, relation, userId):
    try:
        dataset = spark.read.parquet(dataset_add)
        dataset.show()
        Rsqr_list = []
        Rsqr_regPara = {}
        print(self.xt)
        # print(data_add)
        label = ''
        for val in label_colm:
            label = val

        # ETL part
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType" or str(x.dataType) == 'TimestampType'
                    or str(x.dataType) == 'DateType' or str(x.dataType) == 'BooleanType'
                    or str(x.dataType) == 'BinaryType'):
                for y in feature_colm:
                    if x.name == y:
                        dataset = dataset.withColumn(y, dataset[y].cast(StringType()))
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        categoryColmList = []
        categoryColmListFinal = []
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in stringFeatures:
            categoryColm = value
            listValue = value
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        if not stringFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)

        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label, outputCol='indexed_' + label,
                                                  handleInvalid="skip").fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label

        indexed_features = []
        encodedFeatures = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm,
                                    handleInvalid="skip").fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)

        featureAssembler = VectorAssembler(inputCols=indexed_features + numericalFeatures,
                                           outputCol='features', handleInvalid="skip")
        dataset = featureAssembler.transform(dataset)
        vectorIndexer = VectorIndexer(inputCol='features', outputCol='vectorIndexedFeatures',
                                      maxCategories=maxCategories,
                                      handleInvalid="skip").fit(dataset)
        dataset = vectorIndexer.transform(dataset)

        trainDataRatioTransformed = self.trainDataRatio
        testDataRatio = 1 - trainDataRatioTransformed
        train_data, test_data = dataset.randomSplit([trainDataRatioTransformed, testDataRatio],
                                                    seed=40)

        ########################################################################
        for t in self.xt:
            lr1 = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label,
                                   elasticNetParam=0, regParam=t)
            regressor1 = lr1.fit(train_data)
            print(t)
            print("coefficient : " + str(regressor1.coefficients))
            reg_sum = regressor1.summary
            r2 = reg_sum.r2
            Rsqr_list.append(r2)
            Rsqr_regPara[r2] = t
            print(r2)

        print(Rsqr_list)
        print(max(Rsqr_list))
        maximum_rsqr = max(Rsqr_list)
        print(Rsqr_regPara)
        final_regPara = []
        for key, val in Rsqr_regPara.items():
            if (key == maximum_rsqr):
                print(val)
                final_regPara.append(val)

        for reg in final_regPara:
            lr_lasso = LinearRegression(featuresCol="vectorIndexedFeatures", labelCol=label,
                                        elasticNetParam=0, regParam=reg)
            regressor = lr_lasso.fit(train_data)
            training_summary = regressor.summary
            r2 = training_summary.r2
            print(r2)
            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)

        prediction = regressor.evaluate(test_data)
        prediction_val = prediction.predictions
        prediction_val.show()
        prediction_val_pand = prediction_val.select(label, "prediction").toPandas()
        prediction_val_pand = prediction_val_pand.assign(
            residual_vall=prediction_val_pand[label] - prediction_val_pand["prediction"])
        prediction_val_pand_residual = prediction_val_pand["residual_vall"]
        prediction_val_pand_label = prediction_val_pand[label]
        prediction_val_pand_predict = prediction_val_pand["prediction"]

        lr_prediction = regressor.transform(test_data)
        lr_prediction.groupBy(label, "prediction").count().show()
        lr_prediction_quantile = lr_prediction.select(label, "prediction")
        lr_prediction_onlypred = lr_prediction.select('prediction')

        # training_summary = regressor.summary
        print("numof_Iterations...%d\n" % training_summary.totalIterations)
        print("ObjectiveHistory...%s\n" % str(training_summary.objectiveHistory))
        print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
        RMSE = training_summary.rootMeanSquaredError
        print("MSE....%f\n" % training_summary.meanSquaredError)
        MSE = training_summary.meanSquaredError
        print("r**2(r-square)....::%f\n" % training_summary.r2)
        r_square = training_summary.r2
        print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
        adjsted_r_square = training_summary.r2adj
        print("deviance residuals %s" % str(training_summary.devianceResiduals))
        training_summary.residuals.show()
        residual_graph = training_summary.residuals
        residual_graph_pandas = residual_graph.toPandas()
        print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
        coefficientStdError = str(training_summary.coefficientStandardErrors)
        print(" Tvalues :\n" + str(training_summary.tValues))
        T_values = str(training_summary.tValues)
        tValuesList = training_summary.tValues
        print(" p values :\n" + str(training_summary.pValues))
        P_values = str(training_summary.pValues)
        coefficientList = list(regressor.coefficients)

        # summaryData
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')

        print(coefficientList)
        coefficientListRounded = []
        for value in coefficientList:
            coefficientListRounded.append(round(value, 4))
        # print(coefficientListRounded)
        # print(intercept_t)
        interceptRounded = round(float(intercept_t), 4)
        # print(interceptRounded)
        # print(RMSE)
        RMSERounded = round(RMSE, 4)
        # print(RMSERounded)
        MSERounded = round(MSE, 4)
        rSquareRounded = round(r_square, 4)
        adjustedrSquareRounded = round(adjsted_r_square, 4)

        coefficientStdError = training_summary.coefficientStandardErrors
        coefficientStdErrorRounded = []
        for value in coefficientStdError:
            coefficientStdErrorRounded.append(round(float(value), 4))
        print(coefficientStdErrorRounded)
        tValuesListRounded = []
        for value in tValuesList:
            tValuesListRounded.append(round(value, 4))
        print(tValuesListRounded)
        pValuesListRounded = []
        PValuesList = training_summary.pValues
        for value in PValuesList:
            pValuesListRounded.append(round(value, 4))
        print(pValuesListRounded)

        # regression equation
        intercept_t = float(intercept_t)
        coefficientList = list(regressor.coefficients)
        equation = label, '=', interceptRounded, '+'
        for feature, coeff in zip(feature_colm, coefficientListRounded):
            coeffFeature = coeff, '*', feature, '+'
            equation += coeffFeature
        equation = equation[:-1]
        print(equation)
        equationAsList = list(equation)

        # significance value
        PValuesList = training_summary.pValues
        significanceObject = {}
        for pValue in pValuesListRounded:
            if (0 <= pValue < 0.001):
                significanceObject[pValue] = '***'
            if (0.001 <= pValue < 0.01):
                significanceObject[pValue] = '**'
            if (0.01 <= pValue < 0.05):
                significanceObject[pValue] = '*'
            if (0.05 <= pValue < 0.1):
                significanceObject[pValue] = '.'
            if (0.1 <= pValue < 1):
                significanceObject[pValue] = '-'
        print(significanceObject)

        # residual vs predicted value
        prediction_data = regressor.summary.predictions
        prediction_data.show()
        prediction_data.select(['prediction']).show()
        predicted = prediction_data.select(['prediction'])
        regressor.summary.residuals.show()
        residuals = regressor.summary.residuals
        pred_d = predicted.withColumn('row_index', f.monotonically_increasing_id())
        res_d = residuals.withColumn('row_index', f.monotonically_increasing_id())
        pred_residuals = pred_d.join(res_d, on=['row_index']).sort('row_index').drop('row_index')
        pred_residuals.show()

        QQPlot = 'QQPlot.parquet'
        locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'
        # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'
        QQPlotAddress = locationAddress + userId + QQPlot
        pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')
        # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
        #                              mode='overwrite')

        ########################################################################
        # scale location plot
        from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

        df_label = prediction_data.select(label, 'prediction',
                                          sqrt(ab(prediction_data[label])).alias("sqrt_label"))
        df_label.show()
        df_sqrt_label_index = df_label.withColumn('row_index', f.monotonically_increasing_id())
        df_sqrt_label_index.show()
        res_d.show()
        sqrt_label_residual_join = df_sqrt_label_index.join(res_d,
                                                            on=['row_index']).sort('row_index').drop('row_index')
        sqrt_label_residual_join.show()
        std_resid = sqrt_label_residual_join.select(
            'sqrt_label', 'prediction',
            (sqrt_label_residual_join['residuals'] / sqrt_label_residual_join['sqrt_label']).alias('std_res'))
        std_resid.show()
        sqrt_std_res = std_resid.select("std_res", 'prediction',
                                        sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
        sqrt_std_res.show()
        sqrt_std_res_fitted = sqrt_std_res.select('prediction', 'sqrt_std_resid')

        scaleLocationPlot = 'scaleLocation.parquet'
        scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
        sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress, mode='overwrite')
        # sqrt_std_res_fitted.write.parquet(
        #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
        #     mode='overwrite')

        ###########
        # QQplot
        # QUANTILE
        from scipy.stats import norm
        import statistics
        import math

        res_d.show()
        sorted_res = res_d.sort('residuals')
        sorted_res.show()
        # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
        #                                meann(col('residuals')).alias('mean'))
        # stdev_ress.show()
        # mean_residual = stdev_ress.select(['mean']).toPandas()
        # l = mean_residual.values.tolist()
        # print(l)
        # stddev_residual = stdev_ress.select(['std_dev']).toPandas()

        # length of the sorted std residuals
        count = sorted_res.groupBy().count().toPandas()
        countList = count.values.tolist()
        tuple1 = ()
        for k in countList:
            tuple1 = k
        for tu in tuple1:
            lengthResiduals = tu
        print(lengthResiduals)
        quantileList = []
        for x in range(0, lengthResiduals):
            quantileList.append((x - 0.5) / (lengthResiduals))
        print(quantileList)

        # Z-score on theoretical quantile
        zTheoriticalTrain = []
        for x in quantileList:
            zTheoriticalTrain.append(norm.ppf(abs(x)))
        print(zTheoriticalTrain)

        sortedResidualPDF = sorted_res.select('residuals').toPandas()
        sortedResidualPDF = sortedResidualPDF['residuals']
        stdevResidualTrain = statistics.stdev(sortedResidualPDF)
        meanResidualTrain = statistics.mean(sortedResidualPDF)
        zPracticalTrain = []
        for x in sortedResidualPDF:
            zPracticalTrain.append((x - meanResidualTrain) / stdevResidualTrain)

        ##########
        target = dataset.select(label)
        pred = prediction_data.select(['prediction'])
        pred_d = pred.withColumn('row_index', f.monotonically_increasing_id())
        target_d = target.withColumn('row_index', f.monotonically_increasing_id())
        pred_target = pred_d.join(target_d, on=['row_index']).drop('row_index')
        pred_target.show()
        dataset.show()
        pred_target_data_update = dataset.join(pred_target, on=[label])
        pred_target_data_update.show(100)

        ##########
        # table_response = {
        #     "Intercept": intercept_t,
        #     "Coefficients": coefficient_t,
        #     "RMSE": RMSE,
        #     "MSE": MSE,
        #     "R_square": r_square,
        #     "Adj_R_square": adjsted_r_square,
        #     "coefficientStdError": coefficientStdError,
        #     "T_value": T_values,
        #     "P_value": P_values
        # }

        y = 0.1
        x = []
        for i in range(0, 90):
            x.append(y)
            y = round(y + 0.01, 2)
        quantile_label = lr_prediction_quantile.approxQuantile(label, x, 0.01)
        quantile_prediction = lr_prediction_quantile.approxQuantile("prediction", x, 0.01)
        Q_label_pred = ''
        print(len(quantile_label))
        length = len(quantile_label)
        for i in range(0, len(quantile_label)):
            Q_label_pred += str(quantile_label[i]) + '\t' + str(quantile_prediction[i]) + '\n'

        import math
        fitted_residual = ''
        print(len(prediction_val_pand_residual))
        length = len(prediction_val_pand_residual)
        for i in range(0, len(prediction_val_pand_residual)):
            fitted_residual += str(prediction_val_pand_predict[i]) + '\t' + \
                str(prediction_val_pand_residual[i]) + '\n'

        ## scale location graph data
        prediction_val_pand_residual
        prediction_val_pand_predict
        prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
        import math
        sqrt_residual = []
        for x in prediction_val_pand_residual_abs:
            sqrt_residual.append(math.sqrt(x))
            # print ("____________________ ", x)
        sqrt_residual

        # calculating std deviation
        import statistics
        print(statistics.stdev(prediction_val_pand_residual))
        stdev_ = statistics.stdev(prediction_val_pand_residual)

        # calculate standardized residuals
        std_res = []
        for x in prediction_val_pand_residual:
            std_res.append(x / stdev_)
        print(std_res)

        # calculating the square root of std_res
        import math
        sqr_std_res = []
        for x in std_res:
            sqr_std_res.append(math.sqrt(abs(x)))
        print(sqr_std_res)
        scale_predict_residual = ''
        for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
            scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
        print(scale_predict_residual)

        # QUANTILE
        y = 0.1
        x = []
        for i in range(0, 90):
            x.append(y)
            y = round(y + 0.01, 2)
        quantile_std_res = spark.createDataFrame(std_res, FloatType())
        quantile_std_res.show()
        quantile_std_res_t = quantile_std_res.approxQuantile('value', x, 0.01)
        print(quantile_std_res_t)
        print(x)

        # calculating the z_score
        from scipy.stats import norm

        ## sort the list
        sorted_std_res = sorted(std_res)

        mean = statistics.mean(sorted_std_res)
        stdev = statistics.stdev(sorted_std_res)
        # print(mean)
        quantile = []
        n = len(std_res)
        print(n)
        for x in range(0, n):
            quantile.append((x - 0.5) / (n))
        print(quantile)

        # z_score theoretical
        z_theory = []
        for x in quantile:
            z_theory.append(norm.ppf(abs(x)))
        # z score for real values
        z_pract = []
        for x in sorted_std_res:
            z_pract.append((x - mean) / stdev)
        Q_label_pred = ''
        for quant, val in zip(z_theory, z_pract):
            Q_label_pred += str(quant) + '\t' + str(val) + '\n'

        graph_response = {
            "Q_Q_plot": Q_label_pred,
            "residual_fitted": fitted_residual,
            "scale_location": scale_predict_residual
        }

        tableContent = \
            {
                'coefficientValuesKey': coefficientListRounded,
                'tValuesKey': tValuesListRounded,
                'pValuesKey': pValuesListRounded,
                'significanceValuesKey': significanceObject,
                'interceptValuesKey': interceptRounded,
                "RMSE": RMSERounded,
                "RSquare": rSquareRounded,
                "AdjRSquare": adjustedrSquareRounded,
                "CoefficientStdError": coefficientStdErrorRounded,
                'equationKey': equation
            }

        json_response = {
            'table_data': tableContent,
            'graph_data': graph_response
        }
        print(json_response)
        return (json_response)

    except Exception as e:
        print('exception is =' + str(e))
#VECTORIZE TRAIN DATA
# NOTE: textFileStream returns a DStream, so each micro-batch RDD would normally be
# converted to a DataFrame inside foreachRDD; the direct createDataFrame calls below
# assume an instantiated sqlContext and a plain RDD of (label, features) rows.
energi_nuclear_train = ssc.textFileStream("train_nuclear.txt")
energi_nuclear_train_labeled = energi_nuclear_train.map(parse_train)
energi_nuclear_train_labeled_DF = sqlContext.createDataFrame(energi_nuclear_train_labeled,
                                                             ["label", "features"])
print(energi_nuclear_train_labeled_DF)

#VECTORIZE TEST DATA
energi_nuclear_test = ssc.textFileStream("test_nuclear.txt")
energi_nuclear_test_labeled = energi_nuclear_test.map(parse_test)
energi_nuclear_test_labeled_DF = sqlContext.createDataFrame(energi_nuclear_test_labeled,
                                                            ["label", "features"])
print(energi_nuclear_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_nuclear_train_labeled_DF)

#See what the model does
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(energi_nuclear_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

sample_test_data_path = 'test_input/linear_regression/sample_linear_regression_data.txt'

spark = SparkSession.builder.appName('lrex').getOrCreate()
all_data = spark.read.format('libsvm').load(sample_test_data_path)

# Split the data into training and test
training_data, test_data = all_data.randomSplit([0.7, 0.3])

# Initialize model
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')

# Fit the model
lrModel = lr.fit(training_data)

test_results = lrModel.evaluate(test_data)
rms = test_results.rootMeanSquaredError
print(rms)

# Unlabelled data
unlabelled_data = test_data.select('features')
predictions = lrModel.transform(unlabelled_data)
print(predictions)
from pyspark.sql.functions import udf

# Linear regression model parameter values
num_iters = 500       # iterations
reg = 1e-1            # regParam
alpha = .2            # elasticNetParam
use_intercept = True  # intercept

# parsed_train_data_df = parsed_train_data_df.withColumn("Year", parsed_train_data_df["Year"].cast(DoubleType()))
parsed_train_data_df = parsed_train_data_df.rdd.map(
    lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_train_data_df = sqlContext.createDataFrame(parsed_train_data_df, ["features", "label"])
parsed_train_data_df

lin_reg = LinearRegression(maxIter=num_iters, regParam=reg, elasticNetParam=alpha,
                           fitIntercept=use_intercept, labelCol='label', featuresCol='features')
first_model = lin_reg.fit(parsed_train_data_df)

%pyspark
coeffs_LR1 = first_model.coefficients
intercept_LR1 = first_model.intercept
print(coeffs_LR1, intercept_LR1)

%pyspark
parsed_val_data_df = parsed_val_data_df.rdd.map(
    lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_val_data_df = sqlContext.createDataFrame(parsed_val_data_df, ["features", "label"])
# parsed_val_data_df = parsed_val_data_df.withColumn("label", parsed_val_data_df["label"].cast(DoubleType()))
val_pred_df = first_model.transform(parsed_val_data_df)

# NOTE: `evaluator` is assumed to be a RegressionEvaluator defined in an earlier
# paragraph, e.g. RegressionEvaluator(predictionCol="prediction", labelCol="label",
# metricName="rmse")
rmse_val_LR1 = evaluator.evaluate(val_pred_df)
output.printSchema()
output.head(1)

final_data = output.select('features', 'Yearly Amount Spent')
final_data.show()

train, test = final_data.randomSplit([0.7, 0.3])
train.describe().show()
test.describe().show()

## Create model with train data set
## And evaluate our model with test data set
lr = LinearRegression(labelCol="Yearly Amount Spent")
linear_regression_model = lr.fit(train)
results = linear_regression_model.evaluate(test)

results.residuals.show()  ## the residuals: how far each prediction missed the label
results.rootMeanSquaredError  ## average prediction error; with labels around +/- 500, 10 is a low number
results.r2  ## the result tells us this is a good model, roughly 98% of variance explained
def main(input_path, output_attribute_index, scikit_output_path, spark_output_path):
    # Instantiate the Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()

    for file_path in hdfs.ls(input_path):
        # Load the file contents and build a string matrix from them
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)

        # Parse the matrix into real-valued input/output arrays
        # that are then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 + output_attribute_index].astype('float64')

        # The model is trained through iterative (partial) updates
        regressor.partial_fit(input_matrix, output_vector)

        # Print the path of the processed file to the console
        print(file_path)

    # Save the created model to the output path
    # passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required in order to save the model)
    session = SparkSession(context)

    # Load RDD data from the input path
    input_data = context.textFile(input_path)

    # Split each line into fields
    input_data = input_data.map(lambda x: x.split(","))

    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")

    # Skip the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(
        lambda x: list(map(lambda y: float(y), x[3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler creates the feature column
    # required by the linear regression fit method)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate the LinearRegression object, train it,
    # and save the model to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
from pyspark.ml.regression import LinearRegression

pp_df = spark.read.csv(
    "/Users/danemorgan/Documents/DataScience/CCPP/powerplant.csv",
    header="True",
    inferSchema=True)
pp_df

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols=["AT", "V", "AP", "RH"],
                                  outputCol="features")
vpp_df = vectorAssembler.transform(pp_df)
vpp_df.take(1)

LR = LinearRegression(featuresCol="features", labelCol="PE")
lr_model = LR.fit(vpp_df)

lr_model.coefficients
# should output: DenseVector([-1.9775, -0.2339, 0.0621, -0.1581])

lr_model.intercept
# should output: 454.6092744523414

lr_model.summary.rootMeanSquaredError
# should output: 4.557126016749488

lr_model.save("linearRegression1.model")
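# Aside (a minimal sketch, not in the original notebook): the model saved above can be
# loaded back with LinearRegressionModel.load; the path mirrors the save call.
from pyspark.ml.regression import LinearRegressionModel

reloaded = LinearRegressionModel.load("linearRegression1.model")
print(reloaded.coefficients, reloaded.intercept)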
from pyspark.ml.linalg import Vectors

ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[2:-2]), x[-1]]).toDF(['features', 'crew'])
ad_df.show(10)

# In[4]:

# Build linear regression model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='crew')

# In[5]:

# Fit the model
lr_model = lr.fit(ad_df)

# In[6]:

# Prediction
pred = lr_model.transform(ad_df)
pred.show(10)

# In[7]:

# Model evaluation
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='crew')
evaluator.evaluate(pred)
trainingData = spark_sql_output.rdd.map(
    lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()

featureIndexer = \
    VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                  maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam: 0.0})
modelB = lr.fit(trainingData, {lr.regParam: 100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print('-' * 70)
print('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print('-' * 70)

predictionsB = modelB.transform(trainingData)
print('-' * 70)
print('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print('-' * 70)
df = spark.read.load("/data/regression")

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)

# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
    .setFamily("gaussian")\
    .setLink("identity")\
    .setMaxIter(10)\
    .setRegParam(0.3)\
    .setLinkPredictionCol("linkOut")
print(glr.explainParams())
    d.pop('success_metric', None)
    values = [float(x) for x in d.values()]  # this block is unusable until we have our Hive Data
    return (pred, Vectors.dense(values))

# training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# test set
testParsed = sc.parallelize(map(parsePoint, test_dict))

## create validation set
trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])

lm_model = LinearRegression(featuresCol="features", predictionCol="prediction",
                            maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)

lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
# Note: tuple-parameter lambdas (lambda (p, l): ...) are Python 2 only; index the row
# instead, and go through .rdd since DataFrames no longer expose .map directly.
MSE = results.rdd.map(lambda r: (r[0] - r[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))

lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda r: (r[0] - r[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
metrics = RegressionMetrics(predsAndLabels)
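# Aside (a sketch, not in the original snippet): the same MSE can be computed without
# RDD lambdas via RegressionEvaluator; `lm_transform` here is the test-set transform
# from above.
from pyspark.ml.evaluation import RegressionEvaluator

mse_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                    metricName="mse")
print("Linear Regression testing MSE (evaluator) = " + str(mse_evaluator.evaluate(lm_transform)))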
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
data = spark.read.csv("mlinput.csv", inferSchema=True)
data.printSchema()

feature_columns = data.columns[1:]

# Need to install numpy if you haven't.
# Command: pip install numpy
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
transformed_data = assembler.transform(data)

train, test = transformed_data.randomSplit([0.8, 0.2])

from pyspark.ml.regression import LinearRegression
linearregression = LinearRegression(featuresCol="features", labelCol="_c0")
model = linearregression.fit(train)

predictions = model.transform(test)
predictions.show()
# prepare data frame as required by MLLib
data = spark.sparkContext.parallelize(ratingsPerDayDict.items()) \
    .map(lambda x: (float(x[1]), Vectors.dense(float(x[0]))))
df = data.toDF(["label", "features"])

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create the linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Generate predictions for test data using our linear regression model
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
    print(prediction)
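# Aside (a minimal sketch, not part of the original script): an aggregate error metric
# for the predictions above, using RegressionEvaluator on the cached DataFrame.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                metricName="rmse")
print("RMSE on test data: %f" % evaluator.evaluate(fullPredictions))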
training = spark.read.format('libsvm').load(
    '/FileStore/tables/sample_linear_regression_data.txt')

# COMMAND ----------

training.show()

# COMMAND ----------

lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')

# COMMAND ----------

lrModel = lr.fit(training)

# COMMAND ----------

lrModel.coefficients

# COMMAND ----------

training_summary = lrModel.summary

# COMMAND ----------

training_summary.rootMeanSquaredError

# COMMAND ----------
# MAGIC
# MAGIC **References**
# MAGIC * [MLlib LinearRegression user guide](http://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression)
# MAGIC * [PySpark LinearRegression API](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.regression.LinearRegression)

# COMMAND ----------

# Import LinearRegression class
from pyspark.ml.regression import LinearRegression

# Define LinearRegression algorithm
lr = LinearRegression()

# COMMAND ----------

# Fit 2 models, using different regularization parameters
modelA = lr.fit(dataset, {lr.regParam: 0.0})
modelB = lr.fit(dataset, {lr.regParam: 100.0})
print(">>>> ModelA intercept: %r, coefficient: %r" % (modelA.intercept, modelA.coefficients[0]))
print(">>>> ModelB intercept: %r, coefficient: %r" % (modelB.intercept, modelB.coefficients[0]))

# COMMAND ----------

# MAGIC %md ## Make predictions
# MAGIC
# MAGIC Calling `transform()` on data adds a new column of predictions.

# COMMAND ----------

# Make predictions
predictionsA = modelA.transform(dataset)
display(predictionsA)
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import SparseVector

spark = SparkSession.builder.appName("Regression").getOrCreate()

df = spark.read.format("csv").option("header", True)\
    .option("inferSchema", True).option("delimiter", ",")\
    .load("imports-85.data")

data = df.withColumnRenamed("wheel-base", "label").select("label", "length", "width", "height")
data.show()

from pyspark.ml.regression import LinearRegression

assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
y = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(y)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

from pyspark.sql.functions import col, when

logistic_df = df.withColumn(
    "label",
    when(col("num-of-doors") == "four", 1).otherwise(0)).select("label", "length", "width", "height")
'''see the vectorized feature'''
output.select("Independent Features").show()
output.columns

'''get the sorted column'''
finalized_data = output.select("Independent Features", "Close")
finalized_data.show()

'''Divide the data for Training and Testing'''
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

'''BUILDING MODEL'''
'''Use the linear regression algorithm for model fitting'''
from pyspark.ml.regression import LinearRegression

regressor = LinearRegression(featuresCol='Independent Features', labelCol='Close')
regressor = regressor.fit(train_data)

lr = LinearRegression(featuresCol='Independent Features', labelCol='Close',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_data)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

'''TESTING'''
'''test the data and get the accuracy using root mean squared error'''
lr_predictions = lr_model.transform(test_data)
lr_predictions.select("Close", "Independent Features", "prediction").show(5)

'''EVALUATION'''
train_df.show()
test_df.show()

# # Regression models on item prices

# # Linear Regression model

# In[129]:

# create and train the model
lr = LinearRegression(featuresCol='features', labelCol='price',
                      maxIter=100, regParam=0.2, elasticNetParam=0.2)
lr_model = lr.fit(train_df)

# now we can make some predictions and evaluate performance
lr_predictions = lr_model.transform(test_df)
test_prediction = lr_predictions.select("prediction", "price")
test_prediction.show()

evaluator = RegressionEvaluator(labelCol="price")
print("\nLinear Regression Model")
print("R Squared (R2) on test data = %g"
      % evaluator.evaluate(test_prediction, {evaluator.metricName: "r2"}))
print("Root Mean Squared Error (RMSE) on test data = %g"
      % evaluator.evaluate(test_prediction, {evaluator.metricName: "rmse"}))
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics

# Load training data
training = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load('file:///home/pkatta/Downloads/spark/CASP.csv')

vecAssembler = VectorAssembler(
    inputCols=["F1", "F2", "F3", "F5", "F6", "F7", "F8", "F9"],
    outputCol="features")
t = vecAssembler.transform(training)

(trainingData, testData) = t.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8,
                      featuresCol="features", labelCol="RMSD", predictionCol="prediction")
lrModel = lr.fit(t)

# Print the coefficients and intercept for linear regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
# Load the JSON strings as a Spark Dataframe.
natality_data = spark.read.json(table_json)
# Create a view so that Spark SQL queries can be run against the data.
natality_data.createOrReplaceTempView("natality")

# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
"""
clean_data = spark.sql(sql_query)

# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label", "features"])
training_data.cache()

# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)

# Print the model summary.
print("Coefficients:" + str(model.coefficients))
print("Intercept:" + str(model.intercept))
print("R^2:" + str(model.summary.r2))
model.summary.residuals.show()
# Top 10 correlated Crime Types = ['OTHERTRAFFICINFRACTION', 'VEHICLEANDTRAFFICLAWS',
# 'CRIMINALTRESPASS', 'DANGEROUSDRUGS', 'INTOXICATED&IMPAIREDDRIVING',
# 'OTHEROFFENSESRELATEDTOTHEFT', 'THEFT-FRAUD', 'GRANDLARCENY', 'OTHERSTATELAWS',
# 'PARKINGOFFENSES']

####### Linear regression to validate hypothesis testing between every crime type and unemployment rate #######
for i in range(len(total_columns)):
    columns = [total_columns[i]]
    mergedDf = mergedDfTotal.select(['UnemplymentRate'] + columns)

    ####### Validated results with and without normalising data per crime; RSS seems to do
    ####### better without normalising, so we skip normalising with MinMaxScaler #######
    assembler = VectorAssembler(inputCols=['UnemplymentRate'], outputCol="features")
    vgrouped_arrests_unemp = assembler.transform(mergedDf)

    lr = LinearRegression(featuresCol='features', labelCol=columns[0],
                          maxIter=200, regParam=0.3, elasticNetParam=0.8)
    lr_model = lr.fit(vgrouped_arrests_unemp)
    print("Coefficients: " + str(lr_model.coefficients))
    print("Intercept: " + str(lr_model.intercept))

    trainingSummary = lr_model.summary
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)
    vgrouped_arrests_unemp.describe().show()

    lr_predictions = lr_model.transform(vgrouped_arrests_unemp)
    output = np.array(
        lr_predictions.select(["prediction"] + ['UnemplymentRate'] + columns).collect())
    X, y, y_pred = (output[:, 2].reshape(-1, 1), output[:, 1].reshape(-1, 1),
                    output[:, 0].reshape(-1, 1))
    coefficient_significance = slope_significance_hyp_testing(
        X, y, y_pred, np.array(lr_model.coefficients).reshape(-1, 1), correlation[i])
                                        float(p[7]), float(p[8]), float(p[9]), float(p[10])])))

# In[36]:

# Create the data frame containing the training data with two columns:
# 1) the actual output, or label, of the data; 2) the vector containing the features
trainingDF = spark.createDataFrame(wineDataRDD, ['label', 'features'])
trainingDF.show()

# Create the object of the algorithm, which is linear regression, with its parameters.
# This linear regression parameter makes lr.fit() use at most 10 iterations
lr = LinearRegression(maxIter=10)

# Create a trained model by fitting the parameters using the training data
model = lr.fit(trainingDF)

# In[37]:

# Once the model is prepared, to test the model, prepare the test data containing the
# labels and feature vectors
testDF = spark.createDataFrame(
    [(5.0, Vectors.dense([7.4, 0.7, 0.0, 1.9, 0.076, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8])),
     (5.0, Vectors.dense([7.8, 0.88, 0.0, 2.6, 0.098, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4])),
     (7.0, Vectors.dense([7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 18.0, 0.9968, 3.36, 0.57, 9.5]))],
    ["label", "features"])
#Define Pipeline
pipeline = Pipeline(stages=[
    Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer,
    assembler, lr
])

# COMMAND ----------

ind_model = pipeline.fit(train)
train_final = ind_model.transform(test)
display(train_final)

# COMMAND ----------

# Fit the model
lrModel = lr.fit(train_final)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# COMMAND ----------
df.drop('Date', axis=1, inplace=True)
df.to_csv(os.path.join(dir_path, 'temp_' + key['Key']), index=False, sep=' ', header=False)

data = spark.read.format("libsvm") \
    .load(os.path.join(dir_path, 'temp_' + key['Key']))
test_data = spark.read.format("libsvm").load("final.csv")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(data)

real_preds = lrModel.transform(test_data)
real_preds = real_preds.select(real_preds['features'], real_preds['prediction'])

clean = udf(clean_features)
real_preds = real_preds.select(
    clean(real_preds['features']).alias('Date'),
    real_preds['prediction'].alias('Value'))
real_preds.show()
real_preds.write.option("header", "false").csv("temp")
#Transform to a Data Frame for input to Machine Learning
#Drop columns that are not required (low correlation)
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

#Build the model on training data
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction", "label", "features").show()

evaluator = RegressionEvaluator(predictionCol="prediction", \
                                labelCol="label", metricName="r2")
evaluator.evaluate(predictions)

#Streaming data
from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 1)
inputStream = ssc.textFileStream("../Forex DT/data/1440/streaming1440.csv")
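#Aside (a sketch under assumptions, not in the original file): Spark Streaming requires
#at least one output operation on the DStream before the context can start. A minimal,
#hypothetical handler just counts the rows in each micro-batch; the start/awaitTermination
#calls are the standard way to run and block on the stream.
def handle_batch(rdd):
    # print how many lines arrived in this micro-batch
    print(rdd.count())

inputStream.foreachRDD(handle_batch)
ssc.start()
ssc.awaitTermination()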
print(spark_sql_output.take(10))

trainingData = spark_sql_output.rdd.map(
    lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()

featureIndexer = \
    VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                  maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam: 0.0})
modelB = lr.fit(trainingData, {lr.regParam: 100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print('-' * 70)
print('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print('-' * 70)

predictionsB = modelB.transform(trainingData)
print('-' * 70)
print('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print('-' * 70)
run.log("Model Name", model_name) run.log("Max Iterations", maxIters) run.log("Regularization Rate", regParam) run.log_list("Feature Columns", feature_cols) ############### # TRAIN MODEL # ############### print(" * Training {0} model".format(model_name)) # Instantiate New LinearRegression Object lr = LinearRegression(featuresCol='features', labelCol='duration_minutes', maxIter=maxIters, regParam=regParam, solver="auto") # Train model on transformed training data lr_model = lr.fit(trainDF_transformed) lr_full_model = feature_model.copy() lr_full_model.stages.append(lr_model) print(" * Model trained, scoring validation data") # Run the full model (feature steps and trained model) validation_scored = lr_full_model.transform(validDF) ##################### # MODEL PERFORMANCE # ##################### print(" * Calculating performance metrics") # Calculate Regression Performance rmse = evaluator.evaluate(validation_scored, {evaluator.metricName: "rmse"})
def _train_model_spark(self, data):
    df = self._prepare_data_spark(data)
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE, self.TODAY_PRICE}))

    if self.ann_hidden_nodes_num is None:
        self.ann_hidden_nodes_num = input_num / 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]

    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
    if isinstance(self._train_method, dict):
        if self._model is not None and \
                self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(
                layers=ann_layers, spark=self._spark,
                num_workers=self.spark_worker_numbers, epoch=self.ann_epoch_number,
                featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            if self._model is not None:
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers,
                                                  epoch=100,
                                                  featuresCol="features",
                                                  labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction')
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
# in a DataFrame named `assembled`
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(selected)

# Split the `assembled` DataFrame into training and test
# sets
(train, test) = assembled.randomSplit([0.8, 0.2], 12345)

# ## Specifying and training the model

# instantiate the Spark MLlib linear regression estimator
lr = LinearRegression(featuresCol="features", labelCol="weight")

# Call the `fit` method to fit (train) the linear regression
# model
lr_model = lr.fit(train)

# ## Evaluating the trained model

# Generate predictions on the test set
test_with_predictions = lr_model.transform(test)

# Create an instance of `RegressionEvaluator` class
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="weight",
                                metricName="r2")

# Compute the R-squared
evaluator.evaluate(test_with_predictions)

# ## Interpreting the model
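# A minimal sketch (reusing the fitted `lr_model` from above) of inspecting the learned
# parameters when interpreting the model:
print(lr_model.coefficients)  # one weight per entry of `feature_columns`
print(lr_model.intercept)     # the bias term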
from vectorizer import VectorizeData
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import IntegerType

if __name__ == "__main__":
    train, test = VectorizeData().get_train_test_data()

    reg = LinearRegression(featuresCol='features', labelCol='G3',
                           maxIter=10, regParam=0.3, elasticNetParam=0.8)
    regModel = reg.fit(train)
    tSummary = regModel.summary
    print(tSummary.rootMeanSquaredError, tSummary.r2)
    '''
    Root Mean Squared Error = 1.8853871486836264
    r2 = 0.8266004476656317
    '''

    predictionDF = regModel.transform(test)
    predictionDF = predictionDF.withColumn(
        "prediction", predictionDF["prediction"].cast(IntegerType()))
    output_DF = predictionDF.select("prediction", "G3")
    output_DF.show()
    '''
    +----------+---+
    |prediction| G3|
    +----------+---+
    '''
assembler = VectorAssembler(inputCols=[
    'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'ship_indexer', 'cruise_indexer'
], outputCol='features')

output = assembler.transform(result)

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
output_scaled = scaler.fit(output).transform(output)

data = output_scaled.select('scaledFeatures', 'crew')
train_data, test_data = data.randomSplit([0.7, 0.3])

lr_model = LinearRegression(featuresCol="scaledFeatures", labelCol="crew")
model = lr_model.fit(train_data)

test_results = model.evaluate(test_data)
# test_results.residuals.show()
print(test_results.rootMeanSquaredError)
print(test_results.r2)

data.describe().show()

# check why the model performs so well
from pyspark.sql.functions import corr
df.select(corr('crew', 'passengers')).show()
# COMMAND ----------

finalized_data = output.select('Features', 't2mTemp')
finalized_data.show()

# COMMAND ----------

# DBTITLE 1,Split data
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

# COMMAND ----------

# DBTITLE 1,Train data with LR
from pyspark.ml.regression import LinearRegression

regressor = LinearRegression(featuresCol='Features', labelCol='t2mTemp')
regressor = regressor.fit(train_data)

# COMMAND ----------

# DBTITLE 1,Regression Coefficients
regressor.coefficients

# COMMAND ----------

regressor.intercept

# COMMAND ----------

# DBTITLE 1,Evaluate model with test data
pred_results = regressor.evaluate(test_data)
# `finalized_dataTest` is not defined anywhere in this notebook excerpt, so
# the second evaluation is left commented out.
# pred_resultsTest = regressor.evaluate(finalized_dataTest)
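# COMMAND ----------

# DBTITLE 1,Test metrics (hedged follow-up, not in the original notebook)
# `evaluate` on a fitted LinearRegressionModel returns a LinearRegressionSummary,
# so the held-out metrics can be read off directly.
print(pred_results.rootMeanSquaredError)
print(pred_results.r2)
pred_results.residuals.show()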
# VECTORIZE TRAIN DATA
# Note: the original used ssc.textFileStream(), which returns a DStream, and
# called createDataFrame on the SQLContext class itself. LinearRegression.fit
# needs a DataFrame, so read the files as batch RDDs and convert them.
energi_habis_train = sc.textFile("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = sqlContext.createDataFrame(
    energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

# VECTORIZE TEST DATA
energi_habis_test = sc.textFile("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(
    energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)

# Create model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

# See what the model learned
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                metricName="r2")
evaluator.evaluate(predictions)
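# `parse_train` / `parse_test` are not shown in the excerpt. A hypothetical
# sketch, assuming whitespace-separated lines of "label f1 f2 f3":
from pyspark.ml.linalg import Vectors

def parse_train(line):
    parts = [float(x) for x in line.split()]
    return (parts[0], Vectors.dense(parts[1:]))

parse_test = parse_train  # assumed identical format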
# Find correlations
numFeatures = autoDF.take(1)[0].features.size
labelRDD = autoDF.map(lambda lp: float(lp.label))
for i in range(numFeatures):
    featureRDD = autoDF.map(lambda lp: lp.features[i])
    corr = Statistics.corr(labelRDD, featureRDD, 'pearson')
    print('%d\t%g' % (i, corr))

# Split into training and testing data
(trainingData, testData) = autoDF.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()

# Build the model on training data. `autoDF` is an RDD of mllib LabeledPoints,
# but pyspark.ml estimators need a DataFrame with ml vectors, so convert first.
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.util import MLUtils

trainingDF = MLUtils.convertVectorColumnsToML(trainingData.toDF())
testDF = MLUtils.convertVectorColumnsToML(testData.toDF())

lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingDF)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(testDF)
predictions.select("prediction", "label", "features").show()

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
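# Hedged alternative to the per-feature RDD loop above: pyspark.ml.stat's
# Correlation computes the full Pearson correlation matrix in one pass over a
# DataFrame's vector column (available since Spark 2.2).
from pyspark.ml.stat import Correlation

corr_matrix = Correlation.corr(trainingDF, "features").head()[0]
print(corr_matrix)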
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

# The original mixed pyspark.mllib LabeledPoints/Vectors with a pyspark.ml
# estimator, which fails on Spark 2+; build (label, features) rows with
# ml.linalg vectors instead.
data = [(0.0, Vectors.dense([0.0])),
        (0.99, Vectors.dense([1.0])),
        (2.0, Vectors.dense([2.0])),
        (3.01, Vectors.dense([3.0]))]
training = sqlContext.createDataFrame(data, ["label", "features"])

lr = LinearRegression(maxIter=100, regParam=0.05, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))
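# Hedged follow-up (not in the original): score an unseen point with the
# fitted model; the feature value 4.0 is illustrative.
new_points = sqlContext.createDataFrame([(Vectors.dense([4.0]),)], ["features"])
lrModel.transform(new_points).show()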
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True
).option("delimiter", ",").load(
    "/home/charan/workspaces/big_data_programming/bigdata_progamming_m2_icp/icp7/apps/datasets/imports-85.data"
)
data = data.withColumnRenamed("symboling", "label").select("label", "length",
                                                           "width", "height")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
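# Hedged extension (path and names are illustrative, not from the original):
# persist the fitted model and reload it with the matching model class.
from pyspark.ml.regression import LinearRegressionModel

model.write().overwrite().save("/tmp/imports85-lr-model")
reloaded = LinearRegressionModel.load("/tmp/imports85-lr-model")
print(reloaded.coefficients)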
# Load training data
# data = spark.read.format("libsvm")\
#     .load("sample_linear_regression_data.txt")
# or read it from a local disk (if working with a local Spark)
data = spark.read.format("libsvm")\
    .load("file:///home/hadoop/spark/data/mllib/sample_linear_regression_data.txt")

# Split into training and test data
(train, test) = data.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(train)
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)

# Residuals help show whether LR systematically over- or under-predicts (bias)
trainingSummary.residuals.show()

# Root Mean Squared Error (RMSE) on the training data
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

# R-squared = explained variation / total variation (between 0 and 100%).
# R-squared cannot determine whether the coefficient estimates and
# predictions are biased, which is why you must assess the residual plots.
print("r2: %f" % trainingSummary.r2)
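# Hedged follow-up (not in the original): the test split above is never used.
# evaluate() returns a LinearRegressionSummary computed over held-out data.
testSummary = lrModel.evaluate(test)
print("test RMSE: %f" % testSummary.rootMeanSquaredError)
print("test r2: %f" % testSummary.r2)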
# VECTORIZE TRAIN DATA
energi_terbarukan_train = sc.textFile("train_terbarukan.txt")
energi_terbarukan_train_labeled = energi_terbarukan_train.map(parse_train)
energi_terbarukan_train_labeled_DF = sqlContext.createDataFrame(
    energi_terbarukan_train_labeled, ["label", "features"])
print(energi_terbarukan_train_labeled_DF)

# VECTORIZE TEST DATA
# As above, read as a batch RDD rather than a DStream so it can become a DataFrame.
energi_terbarukan_test = sc.textFile("test_terbarukan.txt")
energi_terbarukan_test_labeled = energi_terbarukan_test.map(parse_test)
energi_terbarukan_test_labeled_DF = sqlContext.createDataFrame(
    energi_terbarukan_test_labeled, ["label", "features"])
print(energi_terbarukan_test_labeled_DF)

# Create model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_terbarukan_train_labeled_DF)

# See what the model learned
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_terbarukan_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",
                                metricName="r2")
evaluator.evaluate(predictions)