Example #1
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #2
 def train(self, rdd):
     """
     This ignores the optimizer parameter since it makes config difficult for Linear Regression.
     :return:  Trained model to be passed to test.
     """
     options = self.options
     if options.loss == "l2":
         if options.reg_type in ["none", "l1", "l2"]:
             return LinearRegressionWithSGD.train(data=rdd,
                                                  iterations=options.num_iterations,
                                                  step=options.step_size,
                                                  miniBatchFraction=1.0,
                                                  regParam=options.reg_param,
                                                  regType=options.reg_type)
         elif options.reg_type == "elastic-net":  # use spark.ml
             lr = MLLinearRegression(maxIter=options.num_iterations, regParam=options.reg_param,
                                     elasticNetParam=options.elastic_net_param)
             # TODO: Do not include time for conversion to DataFrame (but this currently matches
             #       the Scala tests)
             df = rdd.toDF()
             lrModel = lr.fit(df)
             return LinearRegressionModel(lrModel.weights, lrModel.intercept)
         else:
             raise Exception("GLMRegressionTest cannot run with loss = %s, reg_type = %s" \
                             % (options.loss, options.reg_type))
     else:
         raise Exception("GLMRegressionTest does not recognize loss: %s" % options.loss)
Example #3
 def test_linear_regression_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=1)
     model = lr.fit(df)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr-pmml"
     model.write().format("pmml").save(lr_path)
     pmml_text_list = self.sc.textFile(lr_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #4
File: tests.py Project: Bella-Lin/spark
 def test_linear_regression(self):
     lr = LinearRegression(maxIter=1)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr"
     lr.save(lr_path)
     lr2 = LinearRegression.load(lr_path)
     self.assertEqual(lr2.uid, lr2.maxIter.parent,
                      "Loaded LinearRegression instance uid (%s) did not match Param's uid (%s)"
                      % (lr2.uid, lr2.maxIter.parent))
     self.assertEqual(lr._defaultParamMap[lr.maxIter], lr2._defaultParamMap[lr2.maxIter],
                      "Loaded LinearRegression instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #5
    def test_linear_regression_with_huber_loss(self):

        data_path = "data/mllib/sample_linear_regression_data.txt"
        df = self.spark.read.format("libsvm").load(data_path)

        lir = LinearRegression(loss="huber", epsilon=2.0)
        model = lir.fit(df)

        expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537,
                                1.2612, -0.333, -0.5694, -0.6311, 0.6053]
        expectedIntercept = 0.1607
        expectedScale = 9.758

        self.assertTrue(
            np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3))
        self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3))
        self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3))
Example #6
 def test_linear_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                           fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
     self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
     self.assertAlmostEqual(s.meanSquaredError, 0.0)
     self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
     self.assertAlmostEqual(s.r2, 1.0, 2)
     self.assertAlmostEqual(s.r2adj, 1.0, 2)
     self.assertTrue(isinstance(s.residuals, DataFrame))
     self.assertEqual(s.numInstances, 2)
     self.assertEqual(s.degreesOfFreedom, 1)
     devResiduals = s.devianceResiduals
     self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned
     # The child class LinearRegressionTrainingSummary runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
Example #7
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=['client_port', 'Interval'],
                                  outputCol='features')
vdf = vectorAssembler.transform(newdf)
vdf = vdf.select(['features', 'Duration'])

## Dividing the assembled DataFrame into two parts
splits = vdf.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

## Building the model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features',
                      labelCol='Duration',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

## Predictions
lr_predictions = lr_model.transform(test_df)

######### Solution to Problem 3 ###########
## This problem can be addressed similarly to how we predicted session length for an IP
## Here we will need to calculate unique URL visits instead of session duration; a similar Linear Regression model can then be used.

#### Aggregating data to Client_port, Interval and number of unique URL visits level.
df_date = df.withColumn('Date', split_col.getItem(0))
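# A minimal sketch (not part of the original solution) of the aggregation described
# above, assuming the log DataFrame `df` has 'client_port' and 'Interval' columns and
# a hypothetical 'URL' column:
from pyspark.sql.functions import countDistinct

url_visits_df = (df.groupBy('client_port', 'Interval')
                   .agg(countDistinct('URL').alias('unique_url_visits')))
# A VectorAssembler plus LinearRegression can then be fitted on 'unique_url_visits',
# exactly as was done above for session duration.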
# Let's define our vector with only the features we actually want to use to build the model
# We'll ignore the columns above that are highly correlated to one another.

# Note that waterfront is treated as a boolean, so we didn't have to encode it.
# We can just add it to the vector assembler.
assembler = VectorAssembler(
    inputCols=["bedrooms", "bathrooms", "sqft_living", "sqft_above_percentage", "floors", "condition_vector", "grade_vector", "zipcode_vector", "waterfront"],
    outputCol="features")

# Build a Grid of Hyperparameters to test
# Here we build a Grid of hyperparameters so we can test all permutations

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
lr = LinearRegression()

paramGridBuilder = ParamGridBuilder()

paramGrid = paramGridBuilder\
  .addGrid(lr.regParam, [0.01, 0.1, 0.5])\
  .addGrid(lr.elasticNetParam, [0, 0.5, 1])\
  .build()

# Split data into Training and Testing chunks, and prepare to build model
# In order to test many hyperparameter combinations and choose the best-performing model, we use a
# TrainValidationSplit object.
# TrainValidationSplit requires us to supply 4 parameters:
#  1. An estimator.  This is the model builder we will use.  In our case, it is a LinearRegression object
#  2. An evaluator.  This tells how we want to evaluate results to determine which model is best.
#  3. An estimatorParamMaps.  This is the ParamGrid object with all the hyperparameter values
#  4. A trainRatio.  This is the fraction of the data used for training, with the rest held out for validation (see the sketch below).
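# A minimal sketch (not part of the original excerpt) combining the pieces above,
# assuming `lr` and `paramGrid` are the objects defined earlier and `train_df` is a
# hypothetical DataFrame that already contains "features" and "label" columns:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit

tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(metricName="rmse"),
                           trainRatio=0.8)  # hold out 20% of the data for validation
tvsModel = tvs.fit(train_df)
bestModel = tvsModel.bestModel  # the model chosen by the evaluator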
Example #9
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

N_OF_CLUSTERS = 10  # number of clusters included
N_OF_TIME_SLOTS = 14400  # number of time slots that are being used for training
TIME_SLOTS_WITHIN_DAY = 144  # day is divided into that number of slots
FIRST_DAY_DAY_OF_WEEK = 3  # which day of the week was the first day of the year 2015 (0 - Monday, 1 - Tuesday, etc.)

spark = SparkSession.builder.master('spark://172.25.24.242:7077').getOrCreate()
sqlCtx = SQLContext(spark.sparkContext, spark)

invDemandCache.init(spark, sqlCtx)

assembler = VectorAssembler(inputCols=["day_of_week", "time_of_day"],
                            outputCol="features")
lr = LinearRegression(labelCol='demand')


def get_data(start_time, end_time, cluster):
    rows = []
    for tid in range(start_time, end_time):
        demand = invDemandCache.get_demand(tid, cluster)
        day_of_week = (FIRST_DAY_DAY_OF_WEEK + int(tid // 144)) % 7
        time_of_day = tid % 144
        rows.append((day_of_week, time_of_day, demand))
        if tid % 100 == 0:
            print(tid)
    df = spark.createDataFrame(rows, ["day_of_week", "time_of_day", "demand"])
    output = assembler.transform(df)
    return output.select('features', 'demand')
Example #10
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark import SparkContext
from pyspark import SQLContext

sc = SparkContext("local","Simple App")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([
     (1.0, 2.0, Vectors.dense(1.0)),
     (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])

lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight")
model = lr.fit(df)
test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])

print(abs(model.transform(test0).head().prediction - (-1.0)) < 0.001)
    stringIndexer = StringIndexer(inputCol=item, outputCol=item +
                                  ' index').fit(df).transform(df)
    encoder = OneHotEncoder(inputCol=item + ' index',
                            outputCol=item +
                            ' onehot').transform(stringIndexer).select(
                                'Id', item + ' onehot')
    df = df.drop(item)
    df_str = df_str.join(encoder,
                         'Id')  # the output of one hot encoding is a vector
    # unlike R or Python, which expect the input to be a matrix with
    # many columns; each row of an MLlib features column is a vector.
df = df.join(df_str, 'Id', 'inner')
df_price = df.select('Id', 'SalePrice')
df_variable = df.drop('SalePrice')

assembler = VectorAssembler(
    inputCols=df_variable.columns,
    outputCol='features')  # Assemble all vectors together as input
output = assembler.transform(df)
input_data = output.select('SalePrice', 'features')
input_data = input_data.selectExpr("SalePrice as label",
                                   'features as features')

lr = LinearRegression(maxIter=100, regParam=0,
                      elasticNetParam=0.8)  # linear model and parameters

# Fit the model
lrModel = lr.fit(input_data)  # model fit on data
print("Coefficients: " + str(lrModel.coefficients))  # print parameters
print("Intercept: " + str(lrModel.intercept))  # print intercept
    "followers", "friends", "favorited", "status_count", "region_id",
    "user_desc_rating", "count"
],
                            outputCol="feat_vector")
# assembler = VectorAssembler(inputCols=["region_id", "user_desc_rating", "count"], outputCol="feat_vector")
featured_data = assembler.transform(raw_data.na.fill(0))
featured_data = featured_data.filter(featured_data.user_desc_rating != 0.0)
train, test = featured_data.randomSplit([.8, .2], 0)

featuresScaler = StandardScaler(inputCol="feat_vector", outputCol="features")
featuresModel = featuresScaler.fit(train)
scTrain = featuresModel.transform(train)
scTest = featuresModel.transform(test)

# Train model
lr = LinearRegression(labelCol="tweet_rating")
lrModel = lr.fit(scTrain)

# Model and Training info
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# Compute simple error
tested = lrModel.transform(scTest)
err = tested.withColumn('error', tested['prediction'] - tested['tweet_rating']).select('error')
df = spark.createDataFrame(input_data, ["label", "features"])

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")

# Fit the scaler to the DataFrame
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

# Split the scaled data into training and test sets
train_data, test_data = scaled_df.randomSplit([.8, .2], seed=1234)

# Initialize `lr`
lr = LinearRegression(labelCol="label",
                      maxIter=100,
                      regParam=0.3,
                      elasticNetParam=0.8)

# Fit the data to the model
linearModel = lr.fit(train_data)

#Let's run this on our test dataset
predicted = linearModel.transform(test_data)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()
Example #14
#VECTORIZE TRAIN DATA
energi_nuclear_train = ssc.textFileStream("train_nuclear.txt")
energi_nuclear_train_labeled = energi_nuclear_train.map(parse_train)
energi_nuclear_train_labeled_DF = SQLContext.createDataFrame(energi_nuclear_train_labeled["label", "features"])
print(energi_nuclear_train_labeled_DF)

#VECTORIZE TEST DATA
energi_nuclear_test = ssc.textFileStream("test_nuclear.txt")
energi_nuclear_test_labeled = energi_nuclear_test.map(parse_test)
energi_nuclear_test_labeled_DF = SQLContext.createDataFrame(energi_nuclear_test_labeled["label", "features"])
print(energi_nuclear_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_nuclear_train_labeled_DF)

#see what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(energi_nuclear_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-word2vec')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    #read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall',
                        'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letter
    tokenizer = RegexTokenizer(inputCol="reviewText",
                               outputCol="words",
                               pattern="\\P{Alpha}+")

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

    # Word2Vec Features - default: vector length 100
    word2Vec = Word2Vec(inputCol="filtered_words", outputCol="features")

    pipeline_data_processing = Pipeline(stages=[tokenizer, remover, word2Vec])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## ML PIPELINE
    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)

    # FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
        .build()

    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=lr,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()

    ## EVALUATION ON TEST DATA
    #read test json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall',
                       'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH Word2Vec features:\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
Example #16
# COMMAND ----------

# MAGIC %md Spark MLLib supports both `regressors` and `classifiers`, in this example you will use linear regression.  Once you create the `regressor` you will train it, and it will return a `Model`. The `Model` will be the object you use to make predictions.
# MAGIC
# MAGIC * Create an instance of the `LinearRegression` algorithm called `lrModel`:
# MAGIC * Set the label column to "count"
# MAGIC * Set the features column to "features"
# MAGIC * Set the "ElasticNetParam" to 0.5 (this controlls the mix of l1 and l2 regularization--we'll just use an equal amount of each)
# MAGIC * Print the results of calling `explainParams` on `lrModel`.  This will show you all the possible parameters, and whether or not you have customized them.

# COMMAND ----------

from pyspark.ml.regression import LinearRegression

lrModel = LinearRegression()\
  .setLabelCol("count")\
  .setFeaturesCol("features")\
  .setElasticNetParam(0.5)

print("Printing out the model Parameters:")
print("-" * 20)
print(lrModel.explainParams())
print("-" * 20)

# COMMAND ----------

# MAGIC %md
# MAGIC * Use the `fit` method on `lrModel` to provide the `training` dataset for fitting.
# MAGIC * Store the results in `lrFitted`.

# COMMAND ----------
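# A minimal sketch of the step described above (the original excerpt stops here);
# `training` is assumed to be the training DataFrame prepared earlier in the notebook.
lrFitted = lrModel.fit(training)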
Example #17
Statistics.corr(usdVectors)
#Transform to a Data Frame for input to Machine Learning
#Drop columns that are not required (low correlation)

    
usdLP = usdVectors.map(transformationLR.transformToLabeledPoint)
usdDF = sqlContext.createDataFrame(usdLP, ["label", "features"])
usdDF.select("label", "features").show(10)

#Split into training and testing data
(trainingData, testData) = usdDF.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()

#Build the model on training data
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(testData)
predictions.select("prediction","label","features").show()

evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
evaluator.evaluate(predictions)
#Streaming data

from pyspark.streaming import StreamingContext
ssc=StreamingContext(sc,1)
df = spark.read.load("/data/regression")


# COMMAND ----------

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print(lr.explainParams())
lrModel = lr.fit(df)


# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
Example #19
    pred = d_copy['success_metric']
    d.pop('success_metric', None)
    values = [float(x) for x in d.values()] ##this block is unusable until we have our Hive Data
    return (pred, Vectors.dense(values))

# training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# test set
testParsed = sc.parallelize(map(parsePoint, test_dict))


## create validation set

trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])
lm_model = LinearRegression(featuresCol="features", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)
lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))

lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.rdd.map(lambda pl: (pl[0] - pl[1]) ** 2).reduce(lambda x, y: x + y) / results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
metrics = RegressionMetrics(predsAndLabels)
Example #20
# Load the JSON strings as a Spark Dataframe.
natality_data = spark.read.json(table_json)
# Create a view so that Spark SQL queries can be run against the data.
natality_data.createOrReplaceTempView("natality")


# As a precaution, run a query in Spark SQL to ensure no NULL values exist.
sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
"""
clean_data = spark.sql(sql_query)

# Create an input DataFrame for Spark ML using the above function.
training_data = clean_data.rdd.map(vector_from_inputs).toDF(["label",
                                                             "features"])
training_data.cache()
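# A hypothetical sketch (not shown in this excerpt) of what the `vector_from_inputs`
# helper referenced above might look like, assuming the label is weight_pounds and the
# features are the other columns the SQL query checks for NULLs:
from pyspark.ml.linalg import Vectors

def vector_from_inputs(r):
    return (float(r["weight_pounds"]),
            Vectors.dense(float(r["mother_age"]),
                          float(r["father_age"]),
                          float(r["gestation_weeks"])))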

# Construct a new LinearRegression object and fit the training data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)
# Print the model summary.
print "Coefficients:" + str(model.coefficients)
print "Intercept:" + str(model.intercept)
print "R^2:" + str(model.summary.r2)
model.summary.residuals.show()
Example #21
# ## Hyperparameter tuning requirements


# We need to specify four components to perform hyperparameter tuning using
# grid search (a sketch combining the remaining pieces follows at the end of this example):
# * Estimator (i.e. machine learning algorithm)
# * Hyperparameter grid
# * Evaluator
# * Validation method


# ## Specify the estimator

# In this example we will use lasso linear regression as our estimator:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="star_rating", elasticNetParam=1.0)

# Use the `explainParams` method to see the list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to the lasso ($l1$) form of linear regression.
# We want to find a reasonable value for the `regParam` hyperparameter.
# [Elastic_net](https://en.wikipedia.org/wiki/Elastic_net_regularization)

# ## Specify a parameter grid
# 

# use the class below to specify the hyperparameter grid
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
from pyspark.ml.tuning import ParamGridBuilder
regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
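# A hedged sketch (not part of the original excerpt) of the remaining three components
# listed above, reusing `lr` and `regParamList` from this example; the evaluator's label
# column matches the estimator's labelCol ("star_rating"):
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

grid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()
evaluator = RegressionEvaluator(labelCol="star_rating", metricName="rmse")
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)  # 3-fold CV as the validation method
# cvModel = cv.fit(training_df)  # training_df is a hypothetical assembled DataFrame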
                    ZN_,
                    price_
                    FROM temp_sql_table """)
print (spark_sql_output.take(10))

trainingData=spark_sql_output.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()
featureIndexer =\
VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

(trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

#################### SPARK ML  ####################

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(trainingData, {lr.regParam:0.0})
modelB = lr.fit(trainingData, {lr.regParam:100.0})

# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)

predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
# copy data from a local disk to HDFS
## old hadoop fs -put ./spark/data/mllib/ridge-data/lpsa.data /user/hadoop/lpsa.data
#$ hadoop fs -put ./spark/data/mllib/sample_linear_regression_data.txt /user/hadoop/
# Load training data
#data = spark.read.format("libsvm")\
#    .load("sample_linear_regression_data.txt")
# or read it from a local disk (if working with a local Spark)

data = spark.read.format("libsvm")\
    .load("file:///home/hadoop/spark/data/mllib/sample_linear_regression_data.txt")

# split into training and test data
(train, test) = data.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)


# Fit the model
lrModel = lr.fit(train)

print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
# Helps show whether LR systematically over- or under-predicts the data (bias)
trainingSummary.residuals.show()
# Root Mean Squared Error (RMSE) on the training data
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
Example #24
#VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = SQLContext.createDataFrame(energi_habis_train_labeled["label", "features"])
print(energi_habis_train_labeled_DF)

#VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = SQLContext.createDataFrame(energi_habis_test_labeled["label", "features"])
print(energi_habis_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

#see what the model does
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

#Predict on the test data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Example #25
# ## Load Dataset

# Loading the data set with the spark method.

# In[3]:

# Load training data
df = spark.read.format("csv").option("header", "true").load("data/train.csv")

# ## Preparing Data

# Computing the LinearRegression model

# In[4]:

lr = LinearRegression()

# It's crucial to check the attribute types to see whether any categorical attributes need processing, which is not the case here.

# In[5]:

df.dtypes

# It's necessary to cast all the attributes to float, since Spark ML works with numeric types and cannot use string columns directly.

# In[6]:

df = df.select(
    df['fare_amount'].cast("float").alias('fare_amount'),
    df['pickup_longitude'].cast("float").alias('pickup_longitude'),
    df['pickup_latitude'].cast("float").alias('pickup_latitude'),
Example #26
    def _train_model_spark(self, data):
        df = self._prepare_data_spark(data)
        input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE,
                                                self.TODAY_PRICE}))

        if self.ann_hidden_nodes_num is None:
            self.ann_hidden_nodes_num = input_num // 2 + 1
        ann_layers = [input_num,
                      # input_num / 3 * 2,
                      # input_num / 3,
                      self.ann_hidden_nodes_num,
                      2]

        self.logger.info('layer settings are {}'.format(ann_layers))
        self.logger.info('training method is {}'.format(self._train_method))
        self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
        if isinstance(self._train_method, dict):
            if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                self._model[self.CHANGE_AMOUNT].stop_server()
            self._model = {self.CHANGE_AMOUNT: None,
                           self.CHANGE_DIRECTION: None}

            if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                      maxIter=self.linear_regression_training_times,
                                      regParam=self.linear_regression_regularization_parameter,
                                      predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = lr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth,
                                            predictionCol='AmountPrediction')
                self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
            elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                          num_workers=self.spark_worker_numbers,
                                                                          epoch=self.ann_epoch_number,
                                                                          featuresCol="features",
                                                                          labelCol=self.CHANGE_AMOUNT,
                                                                          predictionCol='AmountPrediction'
                                                                          )
                self._model[self.CHANGE_AMOUNT].fit(df)
            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

            if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
                lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                        maxIter=self.logistic_regression_training_times,
                                        regParam=self.linear_regression_regularization_parameter,
                                        predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = lr.fit(df)
            elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
                rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                             numTrees=self.random_forest_tree_number,
                                             maxDepth=self.random_forest_tree_max_depth,
                                             predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = rfc.fit(df)

            elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 2
                mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                      labelCol=self.CHANGE_DIRECTION,
                                                      layers=ann_layers,
                                                      predictionCol='DirPrediction')
                self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        else:
            if self._train_method == self.LINEAR_REGRESSION:
                lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                      regParam=self.linear_regression_regularization_parameter,
                                      maxIter=self.linear_regression_training_times)
                self._model = lr.fit(df)
            elif self._train_method == self.RANDOM_FOREST:
                rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                            predictionCol='prediction',
                                            numTrees=self.random_forest_tree_number,
                                            maxDepth=self.random_forest_tree_max_depth)
                self._model = rfr.fit(df)

            elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
                ann_layers[-1] = 1
                if self._model is not None:
                    self._model.stop_server()
                self.logger.warn('layers are {}'.format(ann_layers))
                self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                      num_workers=self.spark_worker_numbers, epoch=100,
                                                      featuresCol="features", labelCol=self.TARGET_PRICE,
                                                      predictionCol='prediction'
                                                      )
                self._model.fit(df)

            else:
                self.logger.warn('Unsupported training method {}'.format(self._train_method))
                raise ValueError('Unsupported training method {}'.format(self._train_method))

        return self._model
Example #27
# Fit the scaler to the DataFrame

ini = time.time()

scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df_teste = scaler.transform(df)

# Inspect the result
#scaled_df.take(2)

from pyspark.ml.regression import LinearRegression

# Initialize `lr`
lr = LinearRegression(labelCol="label", maxIter=1)

# Fit the data to the model
linearModel = lr.fit(scaled_df_treino)
predicted = linearModel.transform(scaled_df_teste)

# Extract the predictions and the "known" correct labels
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])

# Zip `predictions` and `labels` into a list
predictionAndLabel = predictions.zip(labels).collect()

fim = time.time()

print("TEMPO###################")
Example #28
spark = SparkSession.builder.appName('LineerRegresyon').getOrCreate()
veri = spark.read.csv('Ecommerce Customers.csv', inferSchema=True, header=True)
veri.printSchema()
veri.show()
veri.head()
veri.show()
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Avg Session Length', 'Time on App', 'Time on Website',
    'Length of Membership'
],
                            outputCol='features')
VeriVec = assembler.setHandleInvalid("skip").transform(veri)
VeriVec.show()
VeriVec.printSchema()

SonVeri = VeriVec.select('features', 'Yearly Amount Spent')
egitimVeri, testVeri = SonVeri.randomSplit([0.6, 0.4])
egitimVeri.show()

lr = LinearRegression(labelCol='Yearly Amount Spent')
lrModel = lr.fit(egitimVeri)
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,
                                              lrModel.intercept))
sonuclar = lrModel.evaluate(testVeri)
sonuclar.residuals.show()
print("RMSE: {}".format(sonuclar.rootMeanSquaredError))
print("MSE: {}".format(sonuclar.meanSquaredError))
Example #29
model_df = features_df.select('features', 'price')  # build the dataset used for linear regression

# 5 - split the data into training and test data
train_df, test_df = model_df.randomSplit([0.7, 0.3])  # the training/test split ratio is 7:3

print((train_df.count(), len(train_df.columns)))

print((test_df.count(), len(test_df.columns)))

# 6 - build the linear regression model

from pyspark.ml.regression import LinearRegression  # import the linear regression class

print('-------------- Building the linear regression model ------------------')

lin_Reg = LinearRegression(labelCol='price')  # labelCol (as opposed to the features column) is the column to predict

lr_model = lin_Reg.fit(
    train_df)  # fit on the training data; fit returns a fitted model, i.e. a LinearRegressionModel object

print('{}{}'.format('Intercept: ', lr_model.intercept))  # intercept of the linear equation

print('{}{}'.format(
    'Coefficients: ',
    lr_model.coefficients))  # the regression coefficients, corresponding to var_1 through var_5

training_predictions = lr_model.evaluate(train_df)  # evaluate on the training data

print('{}{}'.format('Mean squared error: ',
                    training_predictions.meanSquaredError))  # mean squared error
Example #30
def sliding_window_evaluation(dataframe, feature_columns, num_windows=5, test_size=0.2):
    '''
    Takes an input dataframe, splits it into partitions, and performs a sliding window where
    each partition is split between a train/test set and a linear regression is trained
    and evaluated
    
    Meant for analyzing the performance of a time series regression forecasting model as a random
    split is not appropriate in a time series setting
    '''
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.evaluation import RegressionEvaluator
    
    # Gathering statistics for window partitions and train/test splits
    total_rows = dataframe.count()
    window_size = round(total_rows / num_windows)
    num_training_rows = round((dataframe.count() * (1 - test_size)) / num_windows)

    # Creating a column for partition numbers
    dataframe = (dataframe.withColumn('window_num', ((sqlF.row_number().over(Window.orderBy('date_time_resampled')) - 1) / window_size) + 1)
                          .withColumn('window_num', sqlF.floor(col('window_num'))))  # Truncating to integers
    
    # Specifying the name of the column containing the label
    labelColumn = 'price'

    # Assembling the vectors and outputting the training set
    assembler = VectorAssembler(
        inputCols=feature_columns,
        outputCol='features')
    output = assembler.transform(dataframe)
    vectorizedDF = output.select('features', col(labelColumn).alias('label'), 'window_num')
    
    # Gathering the total RMSE from all windows
    total_RMSE = []
    
    # Looping over windows, splitting into train/test sets, and training and evaluating a model on each set
    for window in range(1, num_windows+1):
        
        # Subsetting the dataframe into the window
        dataWindow = vectorizedDF.filter(col('window_num') == window).drop('window_num')

        # Splitting into train/testing sets
        trainWindow = sqlContext.createDataFrame(dataWindow.head(num_training_rows), dataWindow.schema)
        testWindow = dataWindow.subtract(trainWindow)
        
        # Fitting the model
        # Using L1 regularization for automatic feature selection
        lr = LinearRegression(elasticNetParam=1.0, regParam=0.03)
        model = lr.fit(trainWindow)
    
        # Gathering evaluation and summary metrics
        modelSummary = model.summary
        
        # Creating a plot of the predictions and actuals to see if there is a significant lag
        predictDF = model.transform(testWindow)  # Generating predictions
        testRMSE = RegressionEvaluator(metricName='rmse').evaluate(predictDF)  # RMSE on the test window
        total_RMSE.append(testRMSE)
        fig, ax = plt.subplots()
        ax.plot(predictDF.select('label').collect(), label='Label')
        ax.plot(predictDF.select('prediction').collect(), label='Prediction')
        plt.legend()
        plt.title('Test Set: Predictions and Actuals')
        
        # Reporting results
        print('Window', window)
        print('Training Size:', trainWindow.count())
        print('Testing Size:', testWindow.count())
        print("r2: %f" % modelSummary.r2)
        print("Training RMSE: %f" % modelSummary.rootMeanSquaredError)
        plt.show()  # Plot of actuals vs predictions
        print()
        
    print('Average RMSE for {0} windows: {1}'.format(num_windows, np.mean(total_RMSE)))
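# A hypothetical usage sketch (not part of the original): `prices_df` stands in for a
# DataFrame with a 'date_time_resampled' timestamp column, a 'price' label column, and
# the listed numeric feature columns.
# sliding_window_evaluation(prices_df, feature_columns=['feature_1', 'feature_2'],
#                           num_windows=5, test_size=0.2)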
sc = SparkContext('local', 'lineregression')

sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(fes)

df.select(["features", "labels"]).toPandas()

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=['features'], outputCol='feature')
vhouse_df = vectorAssembler.transform(df)
vhouse_df = vhouse_df.select(['feature', 'labels'])
vhouse_df.show(3)

lr = LinearRegression(featuresCol='feature',
                      labelCol='labels',
                      maxIter=100,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(vhouse_df.randomSplit([0.9, 0.1])[0])
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

a = float(str(lr_model.coefficients[0]))
b = float(str(lr_model.intercept))
import matplotlib.pyplot as plt

plt.scatter(matrix[:, 0], matrix[:, 1])

plt.plot(matrix[:, 0], a * matrix[:, 0] + b, color='red')
Example #32
 def test_write_property(self):
     lr = LinearRegression(maxIter=1)
     self.assertTrue(isinstance(lr.write, MLWriter))
Example #33
output = assembler.transform(indexed)

output.show()

output.select('features', 'crew').show()

final_data = output.select('features', 'crew')

final_data.describe().show()

train_data, test_data = final_data.randomSplit([0.7, 0.3])

train_data.describe().show()

test_data.describe().show()
lr = LinearRegression(labelCol='crew')

lrmodel = lr.fit(train_data)

print("Coefficients {} Intercept{}".format(lrmodel.coefficients,
                                           lrmodel.intercept))

test_results = lrmodel.evaluate(test_data)

print("RMSE{}".format(test_results.rootMeanSquaredError))
print("R2{}".format(test_results.r2))

shipdf.select(corr('crew', 'passengers')).show()

spark.stop()

udf_strpTime_features = udf(strpDate_features, types.IntegerType())
udf_strpTime_trainlabel = udf(strpDate_trainlabel, types.IntegerType())

df_features = df_features.withColumn(
    'realdate', udf_strpTime_features(df_features['date'])).drop('date')
df_train_label = df_train_label.withColumn(
    'realdate', udf_strpTime_trainlabel(df_train_label['date'])).drop('date')

df_new = df_train_label.join(df_features, 'realdate')
df_new = df_new.na.fill(0.0)

train, validation = df_new.randomSplit([0.80, 0.20])

assembler = VectorAssembler(inputCols=[
    'realdate', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10',
    'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20',
    'e21', 'e22', 'e23', 'e24', 'e25', 'e26'
],
                            outputCol='features')

lr = LinearRegression(featuresCol='features', labelCol='label')
pipeline = Pipeline(stages=[assembler, lr])

model = pipeline.fit(train)
prediction = model.transform(validation)
evaluator = RegressionEvaluator(predictionCol='prediction')
res = evaluator.evaluate(prediction, {evaluator.metricName: 'mse'})
print(res)
# Get the count for distinct output classes
distinct_classes = predictions.select("prediction").distinct()
distinct_classes_count = distinct_classes.count()
print("Number of Distinct classes:", distinct_classes_count)

all_data_through_model = trained_model.transform(model_data)
(train_data, test_data) = all_data_through_model.randomSplit([0.8, 0.2])

dict_lin_reg = {}
best_lin_reg = {}
output = {}
for i in distinct_classes.collect():
    print("Currently running for:", i[0])
    required_dataframe = train_data.filter(
        train_data.prediction == i[0]).drop("prediction")
    temp_lin_reg = LinearRegression().setFeaturesCol("features").setLabelCol(
        "trip_duration")
    grid_builder = ParamGridBuilder() \
        .addGrid(temp_lin_reg.regParam,[0.5,1,100,1000]) \
        .addGrid(temp_lin_reg.elasticNetParam,[0.2,0.5,0.8,1]) \
        .addGrid(temp_lin_reg.epsilon,[2,3,5,9,50]) \
        .addGrid(temp_lin_reg.maxIter,[10, 20, 50, 75]) \
        .build()
    cross_validator = CrossValidator(estimator=temp_lin_reg,
                                     estimatorParamMaps=grid_builder,
                                     evaluator=rmseEvaluator,
                                     numFolds=10)
    cv_model = cross_validator.fit(required_dataframe)
    dict_lin_reg[i[0]] = cv_model
    best_lin_reg[i[0]] = cv_model.bestModel
    output[i[0]] = best_lin_reg[i[0]].transform(
        test_data.filter(test_data.prediction == i[0]).drop("prediction"))
Example #36
# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Prepare training and test data.
data = spark.read.format("libsvm") \
    .load("sample_linear_regression_data.txt")
train, test = data.randomSplit([0.9, 0.1], seed=12345)

lr = LinearRegression(maxIter=10)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)
# Define input path
input_path = "C:\\Users\\Lenovo\\PycharmProjects\\M2_ICP7"

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema", True).option("delimiter",
                                ",").load(input_path + "\\car.csv")
data = data.withColumnRenamed("wheel-base",
                              "label").select("label", "length", "width",
                                              "height")

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
model = lr.fit(data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
Example #38
################################ ML ###################################
from pyspark.ml.linalg import Vectors
test3 = test1.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(
    ['Length', 'Speed'])
test3.show(5)

# In[ ]:

from pyspark.ml.regression import LinearRegression

# Load training data
##training = spark.read.format("libsvm")\
##    .load("data/mllib/sample_linear_regression_data.txt")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
##lrModel = lr.fit(training)
lrModel = lr.fit(test2)
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
Example #39
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.regression import LabeledPoint

data= [LabeledPoint(0.0, Vectors.dense([0.0]),), LabeledPoint(0.99, Vectors.dense([1.0])), LabeledPoint(2.0, Vectors.dense([2.0])), LabeledPoint(3.01, Vectors.dense([3.0]))]
training = sqlContext.createDataFrame(data)

lr = LinearRegression(maxIter=100, regParam=0.05, elasticNetParam=0.8)
lrModel = lr.fit(training)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Example #40
                StructField(
                    "house_pricing",
                    ArrayType(
                        StructType([
                            StructField("beds", IntegerType(), False),
                            StructField("baths", IntegerType(), False),
                            StructField("sq__ft", IntegerType(), False),
                            StructField("price", IntegerType(), False)
                        ])))
            ]))
    ])

    df = (spark.createDataFrame(collection, sch).select(
        explode(col("capabilities.{0}".format(
            "house_pricing"))).alias("house_pricing")).withColumn(
                "beds", col("house_pricing.beds")).withColumn(
                    "baths", col("house_pricing.baths")).withColumn(
                        "sq__ft", col("house_pricing.sq__ft")).withColumn(
                            "price", col("house_pricing.price")))

    assembler = VectorAssembler(inputCols=["beds", "baths", "sq__ft"],
                                outputCol="features")
    assembled_df = assembler.transform(df)
    lr = LinearRegression(
        maxIter=10).setLabelCol("price").setFeaturesCol("features")
    model = lr.fit(assembled_df)
    test_df = spark.createDataFrame((([1., 1., 70.]), ),
                                    ["beds", "baths", "sq__ft"])
    assembled_test_df = model.transform(assembler.transform(test_df))
    assembled_test_df.show(truncate=False)
Example #41
# In the previous exercise you added more predictors to the flight duration model. The model performed well on testing data, but with so many coefficients it was difficult to interpret.

# In this exercise you'll use Lasso regression (regularized with a L1 penalty) to create a more parsimonious model. Many of the coefficients in the resulting model will be set to zero. This means that only a subset of the predictors actually contribute to the model. Despite the simpler model, it still produces a good RMSE on the testing data.

# You'll use a specific value for the regularization strength. Later you'll learn how to find the best value using cross validation.

# The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test.

# Instructions
# 100 XP
# Fit a linear regression model to the training data.
# Calculate the RMSE on the testing data.
# Look at the model coefficients.
# Get the count of coefficients equal to 0.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (α = 1) to training data
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([beta == 0 for beta in regression.coefficients])
print("Number of ceofficients equal to 0:", zero_coeff)
def main():
    st_time = datetime.now()  #Start time
    # Start model training
    print('Model training has started...')
    beneficiary = spark.read.parquet(*beneficiary_files)
    inpatient = spark.read.parquet(*inpatient_files)
    inpatient = inpatient.fillna({'CLM_FROM_DT': '2008-01-01'})
    outpatient = spark.read.parquet(*outpatient_files)
    beneficiary.printSchema()
    inpatient.printSchema()
    outpatient.printSchema()

    ben = beneficiary.select(
        col('DESYNPUF_ID').alias('PATIENT_ID'),
        'BENE_BIRTH_DT',
        col('BENE_SEX_IDENT_CD').alias("GENDER"),
        col('BENE_RACE_CD').alias('RACE'),
        col('SP_STATE_CODE').alias('STATE'),
        'SP_ALZHDMTA',
        'SP_CHF',
        'SP_CHRNKIDN',
        'SP_CNCR',
        'SP_COPD',
        'SP_DEPRESSN',
        'SP_DIABETES',
        'SP_ISCHMCHT',
        'SP_OSTEOPRS',
        'SP_RA_OA',
        'SP_STRKETIA',
        col('BENRES_IP').alias("ANNUAL_COST"),
    )

    inp = inpatient.groupBy('DESYNPUF_ID').agg(
        count(when(col('ICD9_DGNS_CD_1') != 'nan', True)).alias('DX'),
        count(when(col('ICD9_PRCDR_CD_1') != 'nan', True)).alias('PX'),
        count(when(col('HCPCS_CD_1') != 'nan', True)).alias('HCPCS'),
        max("CLM_FROM_DT").alias("DATE"),
    )

    inner_join = ben.join(inp, ben.PATIENT_ID == inp.DESYNPUF_ID, how='inner')

    timeDiff = (unix_timestamp('DATE', "yyyy-MM-dd HH:mm:ss") -
                unix_timestamp('BENE_BIRTH_DT', "yyyy-MM-dd HH:mm:ss"))
    inner_join = inner_join.withColumn("AGE_YRS",
                                       timeDiff / 60 / 60 / 24 / 365)
    df = inner_join.select('ANNUAL_COST', 'PATIENT_ID', 'AGE_YRS', 'GENDER',
                           'RACE', 'STATE', 'DX', 'PX', 'HCPCS', 'SP_ALZHDMTA',
                           'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
                           'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT',
                           'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA')
    df.filter(col("ANNUAL_COST") != 0).show()

    cat_cols = [
        'GENDER', 'RACE', 'STATE', 'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN',
        'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT',
        'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA'
    ]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in cat_cols
    ]

    pipeline = Pipeline(stages=indexers)
    indexed = pipeline.fit(df).transform(df)
    indexed.show()

    # creating vectors from features
    # Apache Spark MLlib takes its input in vector form
    assembler = VectorAssembler(inputCols=[
        'AGE_YRS', 'GENDER_index', 'RACE_index', 'STATE_index', 'DX', 'PX',
        'HCPCS', 'SP_ALZHDMTA_index', 'SP_CHF_index', 'SP_CHRNKIDN_index',
        'SP_CNCR_index', 'SP_COPD_index', 'SP_DEPRESSN_index',
        'SP_DIABETES_index', 'SP_ISCHMCHT_index', 'SP_OSTEOPRS_index',
        'SP_RA_OA_index', 'SP_STRKETIA_index'
    ],
                                outputCol='features')
    output = assembler.transform(indexed)
    output.select('features', 'ANNUAL_COST').show(5)

    # split the assembled data into training and test sets
    train_data, test_data = output.randomSplit([0.7, 0.3])
    test_data.describe().show()

    # creating an object of class LinearRegression
    # object takes features and label as input arguments
    synpuf_lr = LinearRegression(featuresCol='features',
                                 labelCol='ANNUAL_COST')

    # pass train_data to train model
    trained_synpuf_model = synpuf_lr.fit(train_data)

    # evaluate the trained model on the training data and report R-squared
    synpuf_results = trained_synpuf_model.evaluate(train_data)

    print('R-squared on training data:', synpuf_results.r2)

    unlabeled_data = test_data.select('features')
    unlabeled_data.show(5)

    print('Processing predictions...')
    predictions = trained_synpuf_model.transform(unlabeled_data)
    predictions.show()

    test_df = test_data.toPandas()
    pred_df = predictions.toPandas()

    merged_df = test_df.merge(pred_df)
    merged_df.sort_values(by='prediction', ascending=False, inplace=True)
    merged_df.drop_duplicates(subset=merged_df.columns[1:9], inplace=True)
    merged_df.astype({'features': str, 'DX': 'int32', 'PX': 'int32', 'HCPCS': 'int32'}) \
            .to_parquet(dir + "synpuf_ml_output.parquet", index=False)
    s3.upload_file(dir + "synpuf_ml_output.parquet", "cms-data-1",
                   "Annual_Cost_Predictions/synpuf_ml_output.parquet")
    os.remove(dir + "synpuf_ml_output.parquet")

    print('\n Predictions Completed!')
    fin_time = datetime.now()
    execution_time = fin_time - st_time
    print('\n Total execution time: {0}'.format(str(execution_time)))
    logging(execution_time)

    return
예제 #43
0
date_indexer = StringIndexer(inputCol='Date of Transfer', outputCol='Date_of_TransferIndexed')
date_indexer = date_indexer.fit(data)
property_type_indexer = StringIndexer(inputCol='Property Type', outputCol='Property_typeIndexed')
property_type_indexer = property_type_indexer.fit(data)
olde_new_indexer = StringIndexer(inputCol='Old/New', outputCol='Old_NewIndexed')
olde_new_indexer = olde_new_indexer.fit(data)
town_indexer = StringIndexer(inputCol='Town/City', outputCol='TownIndexed')
town_indexer = town_indexer.fit(data)
district_indexer = StringIndexer(inputCol='District', outputCol='DistrictIndexed')
district_indexer = district_indexer.fit(data)
county_indexer = StringIndexer(inputCol='County', outputCol='CountyIndexed')
county_indexer = county_indexer.fit(data)
data = date_indexer.transform(data)
data = property_type_indexer.transform(data)
data = olde_new_indexer.transform(data)
data = town_indexer.transform(data)
data = district_indexer.transform(data)
data = county_indexer.transform(data)
data.show()
assembler=VectorAssembler(inputCols=['Date_of_TransferIndexed', 'CountyIndexed'],outputCol='features')
output=assembler.transform(data)
final_data=output.select('features','Price')
train_data,test_data=final_data.randomSplit([0.7,0.3])

lr=LinearRegression(labelCol='Price')
lr_model=lr.fit(train_data)

# save the fitted model
filename = 'Machine_Learning'
lr_model.save(os.path.join('Bucket', filename))
예제 #44
0
],
                            outputCol='Attributes')
output = assembler.transform(dataset)

finalized_data = output.select("Attributes", dataset.columns[11])
finalized_data.show()

valid_output = assembler.transform(validationdataset)

valid_finalized_data = valid_output.select("Attributes",
                                           validationdataset.columns[11])
valid_finalized_data.show()

# 80/20 split train / test
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])
regressor = LinearRegression(featuresCol='Attributes',
                             labelCol=dataset.columns[11])

# Train model on the training split
regressor = regressor.fit(train_data)

pred = regressor.evaluate(test_data)

# Show predictions from the test-set evaluation
pred.predictions.show()

predictions = regressor.transform(valid_finalized_data)
predictions.show()

dataset.groupby("quality").count().show()

# ################################################################################################################
ratingsPerDayDict = ratingsRDD.map(lambda x: x.split("\t")) \
                    .map(lambda x: daysSinceEpoch(int(x[3]))) \
                    .countByValue()

# prepare a DataFrame of (label, features) as required by Spark MLlib
data = spark.sparkContext.parallelize(ratingsPerDayDict.items()) \
        .map(lambda x: (float(x[1]), Vectors.dense(float(x[0]))))
df = data.toDF(["label", "features"])

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create the linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Generate predictions for test data using our linear regression model 
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
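# The original snippet stops at the comment above; a minimal loop that would do the
# printing (an assumption about the intended code) is:
for prediction, label in predictionAndLabel:
    print(prediction, label)
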
from pyspark.ml.regression import LinearRegression

data=spark.read.csv('Ecommerce_Customers.csv',inferSchema = True,header=True)

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['Avg Session Length','Time on App','Time on Website','Length of Membership'],outputCol='features')

output = assembler.transform(data)

final_data = output.select('features','Yearly Amount Spent')

train_data,test_data = final_data.randomSplit([0.7,0.3])

lr = LinearRegression(labelCol='Yearly Amount Spent')

lr_model = lr.fit(train_data)
print("-------------------------------------------------------------------------")
test_results = lr_model.evaluate(test_data)

print("************************************",test_results.rootMeanSquaredError)

print(test_results.meanSquaredError)

print(test_results.r2)

end = time.time()
print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ ",end-start)

    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
    # Perhaps you're importing data from a real database. Or you are using structured streaming
    # to get your data.
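    # A hypothetical illustration of reading straight into a DataFrame (commented out;
    # the connection details below are made up, not part of this script):
    # df = spark.read.format("jdbc") \
    #     .option("url", "jdbc:postgresql://dbhost:5432/realestate") \
    #     .option("dbtable", "listings") \
    #     .option("user", "spark_user") \
    #     .option("password", "...") \
    #     .load()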

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train the model using our training data
    model = lir.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our linear regression model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()
#VECTORIZE TRAIN DATA
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

energi_terbarukan_train = sc.textFile("train_terbarukan.txt")
energi_terbarukan_train_labeled = energi_terbarukan_train.map(parse_train)
energi_terbarukan_train_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_train_labeled, ["label", "features"])
print(energi_terbarukan_train_labeled_DF)

#VECTORIZE TEST DATA
# read the test split as a batch RDD (textFileStream would yield a DStream, which cannot feed createDataFrame)
energi_terbarukan_test = sc.textFile("test_terbarukan.txt")
energi_terbarukan_test_labeled = energi_terbarukan_test.map(parse_test)
energi_terbarukan_test_labeled_DF = sqlContext.createDataFrame(energi_terbarukan_test_labeled, ["label", "features"])
print(energi_terbarukan_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_terbarukan_train_labeled_DF)

# Inspect the fitted model's coefficients and intercept
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

# Predict on the test data
predictions = lrModel.transform(energi_terbarukan_test_labeled_DF)
predictions.select("prediction","label", "features").show()

# Evaluate the predictions (R-squared)
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
print("R2:", evaluator.evaluate(predictions))