def test_java_object_gets_detached(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)

    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except:
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
def train(self, rdd):
    """
    This ignores the optimizer parameter since it makes config difficult for Linear Regression.
    :return: Trained model to be passed to test.
    """
    options = self.options
    if options.loss == "l2":
        if options.reg_type in ["none", "l1", "l2"]:
            return LinearRegressionWithSGD.train(data=rdd,
                                                 iterations=options.num_iterations,
                                                 step=options.step_size,
                                                 miniBatchFraction=1.0,
                                                 regParam=options.reg_param,
                                                 regType=options.reg_type)
        elif options.reg_type == "elastic-net":  # use spark.ml
            lr = MLLinearRegression(maxIter=options.num_iterations,
                                    regParam=options.reg_param,
                                    elasticNetParam=options.elastic_net_param)
            # TODO: Do not include time for conversion to DataFrame (but this currently matches
            #       the Scala tests)
            df = rdd.toDF()
            lrModel = lr.fit(df)
            return LinearRegressionModel(lrModel.weights, lrModel.intercept)
        else:
            raise Exception("GLMRegressionTest cannot run with loss = %s, reg_type = %s"
                            % (options.loss, options.reg_type))
    else:
        raise Exception("GLMRegressionTest does not recognize loss: %s" % options.loss)
def test_linear_regression_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr-pmml"
    model.write().format("pmml").save(lr_path)
    pmml_text_list = self.sc.textFile(lr_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def test_linear_regression(self):
    lr = LinearRegression(maxIter=1)
    path = tempfile.mkdtemp()
    lr_path = path + "/lr"
    lr.save(lr_path)
    lr2 = LinearRegression.load(lr_path)
    self.assertEqual(lr2.uid, lr2.maxIter.parent,
                     "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
                     % (lr2.uid, lr2.maxIter.parent))
    self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                     "Loaded LinearRegression instance default params did not match " +
                     "original defaults")
    try:
        rmtree(path)
    except OSError:
        pass
def test_linear_regression_with_huber_loss(self):
    data_path = "data/mllib/sample_linear_regression_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)

    lir = LinearRegression(loss="huber", epsilon=2.0)
    model = lir.fit(df)

    expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
                            1.2612, -0.333, -0.5694, -0.6311, 0.6053]
    expectedIntercept = 0.1607
    expectedScale = 9.758

    self.assertTrue(
        np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
    self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
    self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
def test_linear_regression_summary(self):
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
    self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
    self.assertAlmostEqual(s.meanSquaredError, 0.0)
    self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
    self.assertAlmostEqual(s.r2, 1.0, 2)
    self.assertAlmostEqual(s.r2adj, 1.0, 2)
    self.assertTrue(isinstance(s.residuals, DataFrame))
    self.assertEqual(s.numInstances, 2)
    self.assertEqual(s.degreesOfFreedom, 1)
    devResiduals = s.devianceResiduals
    self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class LinearRegressionTrainingSummary runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
from pyspark.ml.feature import VectorAssembler

# Assemble the predictors into a single vector column named 'features'
# (the assembler's outputCol must be 'features', not 'Duration', since
# 'Duration' is the label we are predicting).
vectorAssembler = VectorAssembler(inputCols=['client_port', 'Interval'], outputCol='features')
vdf = vectorAssembler.transform(newdf)
vdf = vdf.select(['features', 'Duration'])

## Dividing the assembled DataFrame into two parts
splits = vdf.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

## Building the model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='Duration',
                      maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

## Predictions
lr_predictions = lr_model.transform(test_df)

######### Solution to Problem 3 ###########
## This problem can be addressed similarly to how we predicted session length for an IP.
## Here we need to calculate unique URL visits instead of session duration; a similar
## linear regression model can then be used (see the sketch after this snippet).

#### Aggregating data to client_port, Interval and number of unique URL visits level.
df_date = df.withColumn('Date', split_col.getItem(0))
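# A minimal sketch of the Problem 3 approach described above, assuming the raw log
# DataFrame is `df` and the request path lives in a column named 'URL' (both names
# are assumptions here, not taken from the original code).
from pyspark.sql.functions import countDistinct
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Aggregate to one row per client_port / Interval with the number of unique URLs visited.
url_counts = df.groupBy('client_port', 'Interval') \
               .agg(countDistinct('URL').alias('unique_urls'))

# Reuse the same assemble / split / fit pattern as above, with unique_urls as the label.
url_assembler = VectorAssembler(inputCols=['client_port', 'Interval'], outputCol='features')
url_df = url_assembler.transform(url_counts).select('features', 'unique_urls')

url_train, url_test = url_df.randomSplit([0.7, 0.3])
url_lr = LinearRegression(featuresCol='features', labelCol='unique_urls',
                          maxIter=10, regParam=0.3, elasticNetParam=0.8)
url_lr_model = url_lr.fit(url_train)
url_predictions = url_lr_model.transform(url_test)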
# Let's define our vector with only the features we actually want to use to build the model.
# We'll ignore the columns above that are highly correlated to one another.
# Note that waterfront is treated as a boolean, so we didn't have to encode it;
# we can just add it to the vector assembler.
assembler = VectorAssembler(
    inputCols=["bedrooms", "bathrooms", "sqft_living", "sqft_above_percentage", "floors",
               "condition_vector", "grade_vector", "zipcode_vector", "waterfront"],
    outputCol="features")

# Build a grid of hyperparameters to test.
# Here we build a grid of hyperparameters so we can test all permutations.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine the best model using
# the evaluator.
lr = LinearRegression()

paramGridBuilder = ParamGridBuilder()
paramGrid = paramGridBuilder\
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])\
    .addGrid(lr.elasticNetParam, [0, 0.5, 1])\
    .build()

# Split data into training and testing chunks, and prepare to build the model.
# In order to test many hyperparameter combinations and choose the best-performing model, we use a
# TrainValidationSplit object.
# TrainValidationSplit requires us to supply 4 parameters (the first three are listed here;
# a sketch of wiring them together follows this snippet):
# 1. An estimator. This is the model builder we will use. In our case, it is a LinearRegression object.
# 2. An evaluator. This tells how we want to evaluate results to determine which model is best.
# 3. An estimatorParamMaps. This is the ParamGrid object with all the hyperparameter values.
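# A minimal sketch of how the TrainValidationSplit pieces are typically wired together,
# continuing the grid built above. The RMSE evaluator, the default "label" column name and
# the 0.8 trainRatio are assumptions for illustration, not taken from the original notebook.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit

evaluator = RegressionEvaluator(metricName="rmse")

tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8)

# fittedTvsModel = tvs.fit(trainingData)  # `trainingData` stands in for the training split described above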
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

N_OF_CLUSTERS = 10           # number of clusters included
N_OF_TIME_SLOTS = 14400      # number of time slots that are being used for training
TIME_SLOTS_WITHIN_DAY = 144  # day is divided into that number of slots
FIRST_DAY_DAY_OF_WEEK = 3    # which day of the week was the first day of the year 2015 (0 - Monday, 1 - Tuesday, etc.)

spark = SparkSession.builder.master('spark://172.25.24.242:7077').getOrCreate()
sqlCtx = SQLContext(spark.sparkContext, spark)
invDemandCache.init(spark, sqlCtx)

assembler = VectorAssembler(inputCols=["day_of_week", "time_of_day"], outputCol="features")
lr = LinearRegression(labelCol='demand')


def get_data(start_time, end_time, cluster):
    rows = []
    for tid in range(start_time, end_time):
        demand = invDemandCache.get_demand(tid, cluster)
        day_of_week = (FIRST_DAY_DAY_OF_WEEK + int(tid // 144)) % 7
        time_of_day = tid % 144
        rows.append((day_of_week, time_of_day, demand))
        if tid % 100 == 0:
            print(tid)
    df = spark.createDataFrame(rows, ["day_of_week", "time_of_day", "demand"])
    output = assembler.transform(df)
    return output.select('features', 'demand')
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark import SparkContext
from pyspark import SQLContext

sc = SparkContext("local", "Simple App")
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame([
    (1.0, 2.0, Vectors.dense(1.0)),
    (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
model = lr.fit(df)
test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
print(abs(model.transform(test0).head().prediction - (-1.0)) < 0.001)
stringIndexer = StringIndexer(inputCol=item, outputCol=item + ' index').fit(df).transform(df)
encoder = OneHotEncoder(inputCol=item + ' index', outputCol=item + ' onehot').transform(stringIndexer).select(
    'Id', item + ' onehot')
df = df.drop(item)
df_str = df_str.join(encoder, 'Id')

# The output of one-hot encoding is a vector. Unlike R or Python, which expect the input
# to be a matrix with many columns, each row of an MLlib features column is a single vector.
df = df.join(df_str, 'Id', 'inner')
df_price = df.select('Id', 'SalePrice')
df_variable = df.drop('SalePrice')

assembler = VectorAssembler(
    inputCols=df_variable.columns,
    outputCol='features')  # Assemble all vectors together as input

output = assembler.transform(df)
input_data = output.select('SalePrice', 'features')
input_data = input_data.selectExpr("SalePrice as label", 'features as features')

lr = LinearRegression(maxIter=100, regParam=0, elasticNetParam=0.8)  # linear model and parameters

# Fit the model
lrModel = lr.fit(input_data)  # model fit on data

print("Coefficients: " + str(lrModel.coefficients))  # print coefficients
print("Intercept: " + str(lrModel.intercept))  # print intercept
"followers", "friends", "favorited", "status_count", "region_id", "user_desc_rating", "count" ], outputCol="feat_vector") # assembler = VectorAssembler(inputCols=["region_id", "user_desc_rating", "count"], outputCol="feat_vector") featured_data = assembler.transform(raw_data.na.fill(0)) featured_data = featured_data.filter(featured_data.user_desc_rating != 0.0) train, test = featured_data.randomSplit([.8, .2], 0) featuresScaler = StandardScaler(inputCol="feat_vector", outputCol="features") featuresModel = featuresScaler.fit(train) scTrain = featuresModel.transform(train) scTest = featuresModel.transform(test) # Train model lr = LinearRegression(labelCol="tweet_rating") lrModel = lr.fit(scTrain) # Model and Training info print("Coefficients: %s" % str(lrModel.coefficients)) print("Intercept: %s" % str(lrModel.intercept)) trainingSummary = lrModel.summary print("numIterations: %d" % trainingSummary.totalIterations) print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory)) trainingSummary.residuals.show() print("RMSE: %f" % trainingSummary.rootMeanSquaredError) print("r2: %f" % trainingSummary.r2) # Compute simple error tested = lrModel.transform(scTest) err = tested.select('prediction').subtract(tested.select('tweet_rating'))
df = spark.createDataFrame(input_data, ["label", "features"])

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the scaler to the DataFrame
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

train_data, test_data = scaled_df.randomSplit([.8, .2], seed=1234)

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=100, regParam=0.3, elasticNetParam=0.8)

# Fit the data to the model
linearModel = lr.fit(train_data)

# Let's run this on our test dataset
predicted = linearModel.transform(test_data)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()
# VECTORIZE TRAIN DATA
energi_nuclear_train = ssc.textFileStream("train_nuclear.txt")
energi_nuclear_train_labeled = energi_nuclear_train.map(parse_train)
# createDataFrame must be called on a SQLContext instance (assumed here to be `sqlContext`)
# with the data and the schema passed as separate arguments.
energi_nuclear_train_labeled_DF = sqlContext.createDataFrame(energi_nuclear_train_labeled, ["label", "features"])
print(energi_nuclear_train_labeled_DF)

# VECTORIZE TEST DATA
energi_nuclear_test = ssc.textFileStream("test_nuclear.txt")
energi_nuclear_test_labeled = energi_nuclear_test.map(parse_test)
energi_nuclear_test_labeled_DF = sqlContext.createDataFrame(energi_nuclear_test_labeled, ["label", "features"])
print(energi_nuclear_test_labeled_DF)

# Create model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_nuclear_train_labeled_DF)

# See what the model does
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_nuclear_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-word2vec')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    # Read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letters
    tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\P{Alpha}+")
    # Stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    # Word2Vec features - default: vector length 100
    word2Vec = Word2Vec(inputCol="filtered_words", outputCol="features")
    pipeline_data_processing = Pipeline(stages=[tokenizer, remover, word2Vec])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## ML PIPELINE
    # Linear regression model
    lr = LinearRegression(maxIter=20, regParam=0.1)

    # FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
        .build()

    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=lr,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()

    ## EVALUATION ON TEST DATA
    # Read test json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print result
    result = "MODEL WITH Word2Vec features:\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
# COMMAND ----------

# MAGIC %md Spark MLlib supports both `regressors` and `classifiers`; in this example you will use linear regression. Once you create the `regressor` you will train it, and it will return a `Model`. The `Model` will be the object you use to make predictions.
# MAGIC
# MAGIC * Create an instance of the `LinearRegression` algorithm called `lrModel`:
# MAGIC   * Set the label column to "count"
# MAGIC   * Set the features column to "features"
# MAGIC   * Set the "ElasticNetParam" to 0.5 (this controls the mix of l1 and l2 regularization--we'll just use an equal amount of each)
# MAGIC * Print the results of calling `explainParams` on `lrModel`. This will show you all the possible parameters, and whether or not you have customized them.

# COMMAND ----------

from pyspark.ml.regression import LinearRegression

lrModel = LinearRegression()\
    .setLabelCol("count")\
    .setFeaturesCol("features")\
    .setElasticNetParam(0.5)

print("Printing out the model Parameters:")
print("-" * 20)
print(lrModel.explainParams())
print("-" * 20)

# COMMAND ----------

# MAGIC %md
# MAGIC * Use the `fit` method on `lrModel` to provide the `training` dataset for fitting.
# MAGIC * Store the results in `lrFitted`.

# COMMAND ----------
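# MAGIC %md A minimal sketch of the fit step described in the cell above, assuming the prepared training DataFrame is named `training` (that name is an assumption here; it is not defined in this snippet).

# COMMAND ----------

lrFitted = lrModel.fit(training)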
Statistics.corr(usdVectors)

# Transform to a DataFrame for input to machine learning
# Drop columns that are not required (low correlation)
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

# Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

# Build the model on training data
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction", "label", "features").show()

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="label", metricName="r2")
evaluator.evaluate(predictions)

# Streaming data
from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 1)
df = spark.read.load("/data/regression")

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)

# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
    .setFamily("gaussian")\
    .setLink("identity")\
    .setMaxIter(10)\
    .setRegParam(0.3)\
    .setLinkPredictionCol("linkOut")
print(glr.explainParams())
    pred = d_copy['success_metric']
    d.pop('success_metric', None)
    values = [float(x) for x in d.values()]  ## this block is unusable until we have our Hive data
    return (pred, Vectors.dense(values))

# Training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# Test set
testParsed = sc.parallelize(map(parsePoint, test_dict))

## Create validation set
trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])

lm_model = LinearRegression(featuresCol="features", predictionCol="prediction",
                            maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)

lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
# map() lives on the underlying RDD, and tuple unpacking in lambdas is Python 2 only.
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))

lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
metrics = RegressionMetrics(predsAndLabels)
# Load the JSON strings as a Spark DataFrame.
natality_data = spark.read.json(table_json)
# Create a view so that Spark SQL queries can be run against the data.
natality_data.createOrReplaceTempView("natality")

# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
"""
clean_data = spark.sql(sql_query)

# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label", "features"])
training_data.cache()

# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)

# Print the model summary.
print("Coefficients:" + str(model.coefficients))
print("Intercept:" + str(model.intercept))
print("R^2:" + str(model.summary.r2))
model.summary.residuals.show()
# ## Hyperparameter tuning requirements
# We need to specify four components to perform hyperparameter tuning using
# grid search:
# * Estimator (i.e. machine learning algorithm)
# * Hyperparameter grid
# * Evaluator
# * Validation method

# ## Specify the estimator
# In this example we will use lasso linear regression as our estimator:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="star_rating", elasticNetParam=1.0)

# Use the `explainParams` method to see the list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to the $l1$ (lasso) linear regression model.
# We want to find a reasonable value for that penalty, which lives in the `regParam` parameter.
# [Elastic_net](https://en.wikipedia.org/wiki/Elastic_net_regularization)

# ## Specify a parameter grid
#
# Use the
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
# class to specify the grid of hyperparameters.
from pyspark.ml.tuning import ParamGridBuilder
regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
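# A minimal sketch of the remaining tuning components outlined above (grid, evaluator and
# validation method), built from regParamList. The RMSE metric, the 3-fold cross-validation
# and the `train` DataFrame name are assumptions for illustration, not from the original.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

paramGrid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()

evaluator = RegressionEvaluator(labelCol="star_rating", predictionCol="prediction",
                                metricName="rmse")

cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
# cvModel = cv.fit(train)  # `train` stands in for the training DataFrame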
    ZN_, price_
    FROM temp_sql_table
""")
print(spark_sql_output.take(10))

trainingData = spark_sql_output.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()

featureIndexer = \
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam: 0.0})
modelB = lr.fit(trainingData, {lr.regParam: 100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print('-' * 70)
print('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print('-' * 70)

predictionsB = modelB.transform(trainingData)
print('-' * 70)
print('MODEL B : ')
# Copy data from a local disk to HDFS
## old: hadoop fs -put ./spark/data/mllib/ridge-data/lpsa.data /user/hadoop/lpsa.data
# $ hadoop fs -put ./spark/data/mllib/sample_linear_regression_data.txt /user/hadoop/

# Load training data
# data = spark.read.format("libsvm")\
#     .load("sample_linear_regression_data.txt")

# or read it from a local disk (if working with a local Spark)
data = spark.read.format("libsvm")\
    .load("file:///home/hadoop/spark/data/mllib/sample_linear_regression_data.txt")

# Split into training and test data
(train, test) = data.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(train)
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
# Residuals help to tell if the model systematically over- or under-predicts the data (bias)
trainingSummary.residuals.show()
# Root Mean Squared Error (RMSE) on the training data
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
# VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
# createDataFrame must be called on a SQLContext instance (assumed here to be `sqlContext`)
# with the data and the schema passed as separate arguments.
energi_habis_train_labeled_DF = sqlContext.createDataFrame(energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

# VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)

# Create model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

# See what the model does
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
# ## Load Dataset
# Loading the data set with the spark method.

# In[3]:

# Load training data
df = spark.read.format("csv").option("header", "true").load("data/train.csv")

# ## Preparing Data
# Computing the LinearRegression model

# In[4]:

lr = LinearRegression()

# It's crucial to check the types of the attributes to see whether any categorical
# attributes need processing, but that is not the case here.

# In[5]:

df.dtypes

# It's necessary to cast all the attributes to float, since Spark ML works with
# numeric column types and cannot fit a model on string columns.

# In[6]:

df = df.select(
    df['fare_amount'].cast("float").alias('fare_amount'),
    df['pickup_longitude'].cast("float").alias('pickup_longitude'),
    df['pickup_latitude'].cast("float").alias('pickup_latitude'),
def _train_model_spark(self, data):
    df = self._prepare_data_spark(data)
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE, self.TODAY_PRICE}))

    if self.ann_hidden_nodes_num is None:
        self.ann_hidden_nodes_num = input_num / 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]

    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
    if isinstance(self._train_method, dict):
        if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                      num_workers=self.spark_worker_numbers,
                                                                      epoch=self.ann_epoch_number,
                                                                      featuresCol="features",
                                                                      labelCol=self.CHANGE_AMOUNT,
                                                                      predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    else:
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features",
                                        labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            if self._model is not None:
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers, epoch=100,
                                                  featuresCol="features", labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction')
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
# Fit the scaler to the DataFrame
ini = time.time()
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df_teste = scaler.transform(df)

# Inspect the result
# scaled_df.take(2)

from pyspark.ml.regression import LinearRegression

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=1)

# Fit the data to the model
linearModel = lr.fit(scaled_df_treino)

predicted = linearModel.transform(scaled_df_teste)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()

fim = time.time()
print("TIME###################")
spark = SparkSession.builder.appName('LineerRegresyon').getOrCreate()

veri = spark.read.csv('Ecommerce Customers.csv', inferSchema=True, header=True)
veri.printSchema()
veri.show()
veri.head()

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'
], outputCol='features')

VeriVec = assembler.setHandleInvalid("skip").transform(veri)
VeriVec.show()
VeriVec.printSchema()

SonVeri = VeriVec.select('features', 'Yearly Amount Spent')
egitimVeri, testVeri = SonVeri.randomSplit([0.6, 0.4])
egitimVeri.show()

lr = LinearRegression(labelCol='Yearly Amount Spent')
lrModel = lr.fit(egitimVeri)
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients, lrModel.intercept))

sonuclar = lrModel.evaluate(testVeri)
sonuclar.residuals.show()
print("RMSE: {}".format(sonuclar.rootMeanSquaredError))
print("MSE: {}".format(sonuclar.meanSquaredError))
model_df = features_df.select('features', 'price')  # Build the dataset used for linear regression

# 5 - Split the data into training and test sets
train_df, test_df = model_df.randomSplit([0.7, 0.3])  # 70/30 split between training and test data
print((train_df.count(), len(train_df.columns)))
print((test_df.count(), len(test_df.columns)))

# 6 - Build the linear regression model
from pyspark.ml.regression import LinearRegression  # import the linear regression estimator
print('-------------- Building the linear regression model ------------------')

lin_Reg = LinearRegression(labelCol='price')  # labelCol is the column to predict, relative to the features column

lr_model = lin_Reg.fit(train_df)  # fit on the training data; returns a fitted LinearRegressionModel

print('{}{}'.format('Intercept: ', lr_model.intercept))  # intercept of the fitted equation
print('{}{}'.format('Coefficients: ', lr_model.coefficients))  # coefficients for var_1 ... var_5

training_predictions = lr_model.evaluate(train_df)  # evaluate on the training data
print('{}{}'.format('Mean squared error: ', training_predictions.meanSquaredError))
def sliding_window_evaluation(dataframe, feature_columns, num_windows=5, test_size=0.2):
    '''
    Takes an input dataframe, splits it into partitions, and performs a sliding window where each
    partition is split between a train/test set and a linear regression is trained and evaluated.

    Meant for analyzing the performance of a time series regression forecasting model, since a
    random split is not appropriate in a time series setting.
    '''
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.evaluation import RegressionEvaluator

    # Gathering statistics for window partitions and train/test splits
    total_rows = dataframe.count()
    window_size = round(total_rows / num_windows)
    num_training_rows = round((dataframe.count() * (1 - test_size)) / num_windows)

    # Creating a column for partition numbers
    dataframe = (dataframe.withColumn('window_num',
                                      ((sqlF.row_number().over(Window.orderBy('date_time_resampled')) - 1)
                                       / window_size) + 1)
                          .withColumn('window_num', sqlF.floor(col('window_num'))))  # Truncating to integers

    # Specifying the name of the column containing the label
    labelColumn = 'price'

    # Assembling the vectors and outputting the training set
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    output = assembler.transform(dataframe)
    vectorizedDF = output.select('features', col(labelColumn).alias('label'), 'window_num')

    # Gathering the total RMSE from all windows
    total_RMSE = []

    # Looping over windows, splitting into train/test sets, and training and evaluating a model on each set
    for window in range(1, num_windows + 1):

        # Subsetting the dataframe into the window
        dataWindow = vectorizedDF.filter(col('window_num') == window).drop('window_num')

        # Splitting into train/testing sets
        trainWindow = sqlContext.createDataFrame(dataWindow.head(num_training_rows), dataWindow.schema)
        testWindow = dataWindow.subtract(trainWindow)

        # Fitting the model
        # Using L1 regularization for automatic feature selection
        lr = LinearRegression(elasticNetParam=1.0, regParam=0.03)
        model = lr.fit(trainWindow)

        # Gathering evaluation and summary metrics
        modelSummary = model.summary

        # Creating a plot of the predictions and actuals to see if there is a significant lag
        predictDF = model.transform(testWindow)  # Generating predictions

        # Evaluating the RMSE on the held-out window (the original referenced an undefined
        # testRMSE; it is computed here with the already-imported RegressionEvaluator)
        testRMSE = RegressionEvaluator(metricName='rmse').evaluate(predictDF)
        total_RMSE.append(testRMSE)

        fig, ax = plt.subplots()
        ax.plot(predictDF.select('label').collect(), label='Label')
        ax.plot(predictDF.select('prediction').collect(), label='Prediction')
        plt.legend()
        plt.title('Test Set: Predictions and Actuals')

        # Reporting results
        print('Window', window)
        print('Training Size:', trainWindow.count())
        print('Testing Size:', testWindow.count())
        print("r2: %f" % modelSummary.r2)
        print("Training RMSE: %f" % modelSummary.rootMeanSquaredError)
        plt.show()  # Plot of actuals vs predictions
        print()

    print('Average RMSE for {0} windows: {1}'.format(num_windows, np.mean(total_RMSE)))
sc = SparkContext('local', 'lineregression')
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(fes)
df.select(["features", "labels"]).toPandas()

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=['features'], outputCol='feature')
vhouse_df = vectorAssembler.transform(df)
vhouse_df = vhouse_df.select(['feature', 'labels'])
vhouse_df.show(3)

lr = LinearRegression(featuresCol='feature', labelCol='labels',
                      maxIter=100, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(vhouse_df.randomSplit([0.9, 0.1])[0])
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

a = float(str(lr_model.coefficients[0]))
b = float(str(lr_model.intercept))

import matplotlib.pyplot as plt
plt.scatter(matrix[:, 0], matrix[:, 1])
plt.plot(matrix[:, 0], a * matrix[:, 0] + b, color='red')
def test_write_property(self):
    lr = LinearRegression(maxIter=1)
    self.assertTrue(isinstance(lr.write, MLWriter))
output = assembler.transform(indexed)
output.show()
output.select('features', 'crew').show()

final_data = output.select('features', 'crew')
final_data.describe().show()

train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()
test_data.describe().show()

lr = LinearRegression(labelCol='crew')
lrmodel = lr.fit(train_data)
print("Coefficients: {}  Intercept: {}".format(lrmodel.coefficients, lrmodel.intercept))

test_results = lrmodel.evaluate(test_data)
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("R2: {}".format(test_results.r2))

shipdf.select(corr('crew', 'passengers')).show()
spark.stop()
udf_strpTime_features = udf(strpDate_features, types.IntegerType())
udf_strpTime_trainlabel = udf(strpDate_trainlabel, types.IntegerType())

df_features = df_features.withColumn(
    'realdate', udf_strpTime_features(df_features['date'])).drop('date')
df_train_label = df_train_label.withColumn(
    'realdate', udf_strpTime_trainlabel(df_train_label['date'])).drop('date')

df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)

train, validation = df_new.randomSplit([0.80, 0.20])

assembler = VectorAssembler(inputCols=[
    'realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10',
    'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20',
    'e21', 'e22', 'e23', 'e24', 'e25', 'e26'
], outputCol='features')

lr = LinearRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(train)

prediction = model.transform(validation)
evaluator = RegressionEvaluator(predictionCol='prediction')
res = evaluator.evaluate(prediction, {evaluator.metricName: 'mse'})
print(res)
# Get the count for distinct output classes
distinct_classes = predictions.select("prediction").distinct()
distinct_classes_count = distinct_classes.count()
print("Number of Distinct classes:", distinct_classes_count)

all_data_through_model = trained_model.transform(model_data)
(train_data, test_data) = all_data_through_model.randomSplit([0.8, 0.2])

dict_lin_reg = {}
best_lin_reg = {}
output = {}

for i in distinct_classes.collect():
    print("Currently running for:", i[0])
    required_dataframe = train_data.filter(
        train_data.prediction == i[0]).drop("prediction")

    temp_lin_reg = LinearRegression().setFeaturesCol("features").setLabelCol("trip_duration")

    grid_builder = ParamGridBuilder() \
        .addGrid(temp_lin_reg.regParam, [0.5, 1, 100, 1000]) \
        .addGrid(temp_lin_reg.elasticNetParam, [0.2, 0.5, 0.8, 1]) \
        .addGrid(temp_lin_reg.epsilon, [2, 3, 5, 9, 50]) \
        .addGrid(temp_lin_reg.maxIter, [10, 20, 50, 75]) \
        .build()

    cross_validator = CrossValidator(estimator=temp_lin_reg,
                                     estimatorParamMaps=grid_builder,
                                     evaluator=rmseEvaluator,
                                     numFolds=10)
    cv_model = cross_validator.fit(required_dataframe)

    dict_lin_reg[i[0]] = cv_model
    best_lin_reg[i[0]] = cv_model.bestModel
    output[i[0]] = best_lin_reg[i[0]].transform(
        test_data.filter(test_data.prediction == i[0]).drop("prediction"))
# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Prepare training and test data.
data = spark.read.format("libsvm") \
    .load("sample_linear_regression_data.txt")
train, test = data.randomSplit([0.9, 0.1], seed=12345)

lr = LinearRegression(maxIter=10)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
# Define input path
input_path = "C:\\Users\\Lenovo\\PycharmProjects\\M2_ICP7"

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True).option("delimiter", ",").load(input_path + "\\car.csv")
data = data.withColumnRenamed("wheel-base", "label").select("label", "length", "width", "height")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
################################ ML ###################################

from pyspark.ml.linalg import Vectors

test3 = test1.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['Length', 'Speed'])
test3.show(5)

# In[ ]:

from pyspark.ml.regression import LinearRegression

# Load training data
## training = spark.read.format("libsvm")\
##     .load("data/mllib/sample_linear_regression_data.txt")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
## lrModel = lr.fit(training)
lrModel = lr.fit(test2)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

# pyspark.ml estimators expect pyspark.ml.linalg vectors; the original imported the
# mllib Vectors/LabeledPoint types, which Spark 2.x+ ml estimators reject.
data = [(0.0, Vectors.dense([0.0])),
        (0.99, Vectors.dense([1.0])),
        (2.0, Vectors.dense([2.0])),
        (3.01, Vectors.dense([3.0]))]
training = sqlContext.createDataFrame(data, ["label", "features"])

lr = LinearRegression(maxIter=100, regParam=0.05, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))
        StructField(
            "house_pricing",
            ArrayType(
                StructType([
                    StructField("beds", IntegerType(), False),
                    StructField("baths", IntegerType(), False),
                    StructField("sq__ft", IntegerType(), False),
                    StructField("price", IntegerType(), False)
                ])))
    ]))
])

df = (spark.createDataFrame(collection, sch)
      .select(explode(col("capabilities.{0}".format("house_pricing"))).alias("house_pricing"))
      .withColumn("beds", col("house_pricing.beds"))
      .withColumn("baths", col("house_pricing.baths"))
      .withColumn("sq__ft", col("house_pricing.sq__ft"))
      .withColumn("price", col("house_pricing.price")))

assembler = VectorAssembler(inputCols=["beds", "baths", "sq__ft"], outputCol="features")
assembled_df = assembler.transform(df)

lr = LinearRegression(maxIter=10).setLabelCol("price").setFeaturesCol("features")
model = lr.fit(assembled_df)

test_df = spark.createDataFrame(([1., 1., 70.],), ["beds", "baths", "sq__ft"])
assembled_test_df = model.transform(assembler.transform(test_df))
assembled_test_df.show(truncate=False)
# In the previous exercise you added more predictors to the flight duration model. The model
# performed well on testing data, but with so many coefficients it was difficult to interpret.

# In this exercise you'll use Lasso regression (regularized with an L1 penalty) to create a more
# parsimonious model. Many of the coefficients in the resulting model will be set to zero. This
# means that only a subset of the predictors actually contribute to the model. Despite the simpler
# model, it still produces a good RMSE on the testing data.

# You'll use a specific value for the regularization strength. Later you'll learn how to find the
# best value using cross validation.

# The data (same as previous exercise) are available as flights, randomly split into flights_train
# and flights_test.

# Instructions
# 100 XP
# Fit a linear regression model to the training data.
# Calculate the RMSE on the testing data.
# Look at the model coefficients.
# Get the count of coefficients equal to 0.

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (α = 1) to training data
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of coefficients equal to 0:", zero_coeff)
def main():
    st_time = datetime.now()  # Start time

    # Start model training
    print('Model training has started...')

    beneficiary = spark.read.parquet(*beneficiary_files)
    inpatient = spark.read.parquet(*inpatient_files)
    inpatient = inpatient.fillna({'CLM_FROM_DT': '2008-01-01'})
    outpatient = spark.read.parquet(*outpatient_files)

    beneficiary.printSchema()
    inpatient.printSchema()
    outpatient.printSchema()

    ben = beneficiary.select(
        col('DESYNPUF_ID').alias('PATIENT_ID'),
        'BENE_BIRTH_DT',
        col('BENE_SEX_IDENT_CD').alias("GENDER"),
        col('BENE_RACE_CD').alias('RACE'),
        col('SP_STATE_CODE').alias('STATE'),
        'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN',
        'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA',
        col('BENRES_IP').alias("ANNUAL_COST"),
    )

    inp = inpatient.groupBy('DESYNPUF_ID').agg(
        count(when(col('ICD9_DGNS_CD_1') != 'nan', True)).alias('DX'),
        count(when(col('ICD9_PRCDR_CD_1') != 'nan', True)).alias('PX'),
        count(when(col('HCPCS_CD_1') != 'nan', True)).alias('HCPCS'),
        max("CLM_FROM_DT").alias("DATE"),
    )

    inner_join = ben.join(inp, ben.PATIENT_ID == inp.DESYNPUF_ID, how='inner')

    timeDiff = (unix_timestamp('DATE', "yyyy-MM-dd HH:mm:ss")
                - unix_timestamp('BENE_BIRTH_DT', "yyyy-MM-dd HH:mm:ss"))
    inner_join = inner_join.withColumn("AGE_YRS", timeDiff / 60 / 60 / 24 / 365)

    df = inner_join.select('ANNUAL_COST', 'PATIENT_ID', 'AGE_YRS', 'GENDER', 'RACE', 'STATE',
                           'DX', 'PX', 'HCPCS', 'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN',
                           'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT',
                           'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA')
    df.filter(col("ANNUAL_COST") != 0).show()

    cat_cols = [
        'GENDER', 'RACE', 'STATE', 'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR',
        'SP_COPD', 'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
        'SP_RA_OA', 'SP_STRKETIA'
    ]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in cat_cols
    ]
    pipeline = Pipeline(stages=indexers)
    indexed = pipeline.fit(df).transform(df)
    indexed.show()

    # Creating vectors from features
    # Apache MLlib takes input in vector form
    assembler = VectorAssembler(inputCols=[
        'AGE_YRS', 'GENDER_index', 'RACE_index', 'STATE_index', 'DX', 'PX', 'HCPCS',
        'SP_ALZHDMTA_index', 'SP_CHF_index', 'SP_CHRNKIDN_index', 'SP_CNCR_index',
        'SP_COPD_index', 'SP_DEPRESSN_index', 'SP_DIABETES_index', 'SP_ISCHMCHT_index',
        'SP_OSTEOPRS_index', 'SP_RA_OA_index', 'SP_STRKETIA_index'
    ], outputCol='features')

    output = assembler.transform(indexed)
    output.select('features', 'ANNUAL_COST').show(5)  # output as below

    train_data, test_data = output.randomSplit([0.7, 0.3])
    test_data.describe().show()

    # Creating an object of class LinearRegression;
    # the object takes features and label as input arguments
    synpuf_lr = LinearRegression(featuresCol='features', labelCol='ANNUAL_COST')

    # Pass train_data to train the model
    trained_synpuf_model = synpuf_lr.fit(train_data)

    # Evaluating the trained model for R-squared error
    synpuf_results = trained_synpuf_model.evaluate(train_data)
    print('Rsquared Error :', synpuf_results.r2)

    unlabeled_data = test_data.select('features')
    unlabeled_data.show(5)

    print('Processing predictions...')
    predictions = trained_synpuf_model.transform(unlabeled_data)
    predictions.show()

    test_df = test_data.toPandas()
    pred_df = predictions.toPandas()
    merged_df = test_df.merge(pred_df)
    merged_df.sort_values(by='prediction', ascending=False, inplace=True)
    merged_df.drop_duplicates(subset=merged_df.columns[1:9], inplace=True)
    merged_df.astype({'features': str, 'DX': 'int32', 'PX': 'int32', 'HCPCS': 'int32'}) \
        .to_parquet(dir + "synpuf_ml_output.parquet", index=False)

    s3.upload_file(dir + "synpuf_ml_output.parquet", "cms-data-1",
                   "Annual_Cost_Predictions/synpuf_ml_output.parquet")
    os.remove(dir + "synpuf_ml_output.parquet")

    print('\n Predictions Completed!')

    fin_time = datetime.now()
    execution_time = fin_time - st_time
    print('\n Total execution time: {0}'.format(str(execution_time)))
    logging(execution_time)
    return
date_indexer = StringIndexer(inputCol='Date of Transfer', outputCol='Date_of_TransferIndexed')
date_indexer = date_indexer.fit(data)
property_type_indexer = StringIndexer(inputCol='Property Type', outputCol='Property_typeIndexed')
property_type_indexer = property_type_indexer.fit(data)
olde_new_indexer = StringIndexer(inputCol='Old/New', outputCol='Old_NewIndexed')
olde_new_indexer = olde_new_indexer.fit(data)
town_indexer = StringIndexer(inputCol='Town/City', outputCol='TownIndexed')
town_indexer = town_indexer.fit(data)
district_indexer = StringIndexer(inputCol='District', outputCol='DistrictIndexed')
district_indexer = district_indexer.fit(data)
county_indexer = StringIndexer(inputCol='County', outputCol='CountyIndexed')
county_indexer = county_indexer.fit(data)

data = date_indexer.transform(data)
data = property_type_indexer.transform(data)
data = olde_new_indexer.transform(data)
data = town_indexer.transform(data)
data = district_indexer.transform(data)
data = county_indexer.transform(data)
data.show()

assembler = VectorAssembler(inputCols=['Date_of_TransferIndexed', 'CountyIndexed'],
                            outputCol='features')
output = assembler.transform(data)
final_data = output.select('features', 'Price')
train_data, test_data = final_data.randomSplit([0.7, 0.3])

lr = LinearRegression(labelCol='Price')
lr_model = lr.fit(train_data)

# Save results
filename = 'Machine_Learning'
lr_model.save(os.path.join('Bucket'))
], outputCol='Attributes')

output = assembler.transform(dataset)
finalized_data = output.select("Attributes", dataset.columns[11])
finalized_data.show()

valid_output = assembler.transform(validationdataset)
valid_finalized_data = valid_output.select("Attributes", validationdataset.columns[11])
valid_finalized_data.show()

# 80/20 split train / test
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

regressor = LinearRegression(featuresCol='Attributes', labelCol=dataset.columns[11])

# Train model with the training split
regressor = regressor.fit(train_data)

# Predict on the test split
pred = regressor.evaluate(test_data)
pred.predictions.show()

predictions = regressor.transform(valid_finalized_data)
predictions.show()

dataset.groupby("quality").count().show()

# ################################################################################################################
ratingsPerDayDict = ratingsRDD.map(lambda x: x.split("\t")) \
    .map(lambda x: daysSinceEpoch(int(x[3]))) \
    .countByValue()

# Prepare a DataFrame as required by MLlib
data = spark.sparkContext.parallelize(ratingsPerDayDict.items()) \
    .map(lambda x: (float(x[1]), Vectors.dense(float(x[0]))))
df = data.toDF(["label", "features"])

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create the linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Generate predictions for test data using our linear regression model
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
from pyspark.ml.regression import LinearRegression

data = spark.read.csv('Ecommerce_Customers.csv', inferSchema=True, header=True)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
    outputCol='features')
output = assembler.transform(data)

final_data = output.select('features', 'Yearly Amount Spent')
train_data, test_data = final_data.randomSplit([0.7, 0.3])

lr = LinearRegression(labelCol='Yearly Amount Spent')
lr_model = lr.fit(train_data)
print("-------------------------------------------------------------------------")

test_results = lr_model.evaluate(test_data)
print("************************************", test_results.rootMeanSquaredError)
print(test_results.meanSquaredError)
print(test_results.r2)

end = time.time()
print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ ", end - start)
# Convert this RDD to a DataFrame
colNames = ["label", "features"]
df = data.toDF(colNames)

# Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
# Perhaps you're importing data from a real database. Or you are using structured streaming
# to get your data.

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create our linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Now see if we can predict values in our test data.
# Generate predictions using our linear regression model for all features in our
# test dataframe:
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()
# VECTORIZE TRAIN DATA
energi_terbarukan_train = sc.textFile("train_terbarukan.txt")
energi_terbarukan_train_labeled = energi_terbarukan_train.map(parse_train)
# createDataFrame must be called on a SQLContext instance (assumed here to be `sqlContext`)
# with the data and the schema passed as separate arguments.
energi_terbarukan_train_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_train_labeled, ["label", "features"])
print(energi_terbarukan_train_labeled_DF)

# VECTORIZE TEST DATA
energi_terbarukan_test = ssc.textFileStream("test_terbarukan.txt")
energi_terbarukan_test_labeled = energi_terbarukan_test.map(parse_test)
energi_terbarukan_test_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_test_labeled, ["label", "features"])
print(energi_terbarukan_test_labeled_DF)

# Create model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_terbarukan_train_labeled_DF)

# See what the model does
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_terbarukan_test_labeled_DF)
predictions.select("prediction", "label", "features").show()

# Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)