Example No. 1
from pyspark.ml import Pipeline
from pyspark.ml.feature import (StringIndexer, OneHotEncoder, VectorAssembler,
                                StandardScaler, PCA)
from pyspark.ml.regression import (LinearRegression, DecisionTreeRegressor,
                                   RandomForestRegressor, GBTRegressor)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator


def build_and_save_model(df, config):
    # Feature engineering
    categorical_cols = config["categorical_cols"]
    numerical_cols = config["numerical_cols"]
    indexed_cols = [c + '_index' for c in categorical_cols]
    encoded_cols = [c + '_dummy' for c in categorical_cols]

    indexer = StringIndexer(inputCols=categorical_cols,
                            outputCols=indexed_cols,
                            handleInvalid='keep')
    onehotencoder = OneHotEncoder(inputCols=indexed_cols,
                                  outputCols=encoded_cols)
    assembler = VectorAssembler(inputCols=encoded_cols, outputCol='features')
    standardscaler = StandardScaler(inputCol='features',
                                    outputCol='normalized_features',
                                    withMean=True)
    pca = PCA(inputCol='normalized_features', outputCol='pca_features')

    # Model selection and hyperparameter tuning
    model_selector = 'RandomForest'

    if model_selector == 'ElasticNet':
        regressor = LinearRegression(featuresCol='pca_features')
        grid = ParamGridBuilder() \
            .addGrid(pca.k,[2,5,10]) \
            .addGrid(regressor.maxIter,[10]) \
            .addGrid(regressor.regParam,[0.1,0.3]) \
            .addGrid(regressor.elasticNetParam,[0.0,0.3,0.5,0.8,1.0]) \
            .build()
        stages = [
            indexer, onehotencoder, assembler, standardscaler, pca, regressor
        ]

    elif model_selector == 'DecisionTree':
        regressor = DecisionTreeRegressor(featuresCol='pca_features')
        grid = ParamGridBuilder() \
            .addGrid(pca.k,[2,10,20]) \
            .addGrid(regressor.maxDepth,[5,10,20]) \
            .addGrid(regressor.maxBins,[10]) \
            .build()
        stages = [
            indexer, onehotencoder, assembler, standardscaler, pca, regressor
        ]

    elif model_selector == 'RandomForest':
        regressor = RandomForestRegressor(featuresCol='features')
        grid = ParamGridBuilder() \
            .addGrid(regressor.maxDepth,[2,5,10]) \
            .addGrid(regressor.maxBins,[10]) \
            .addGrid(regressor.numTrees,[20]) \
            .build()
        stages = [indexer, onehotencoder, assembler, regressor]

    if model_selector == "GradientBoosting":
        regressor = GBTRegressor(featuresCol='features')
        grid = ParamGridBuilder() \
            .addGrid(regressor.maxDepth,[2,5]) \
            .addGrid(regressor.maxBins,[8]) \
            .addGrid(regressor.maxIter,[10]) \
            .build()
        stages = [indexer, onehotencoder, assembler, regressor]

    pipeline = Pipeline(stages=stages)

    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=grid,
                        evaluator=RegressionEvaluator(metricName='mae'),
                        numFolds=10,
                        parallelism=3,
                        seed=123)

    model = cv.fit(df)

    print('cross-validated mean absolute error per parameter map: %s' % model.avgMetrics)
    print(model.bestModel.stages[-1])
    model.write().overwrite().save(config["model_path"])
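
A minimal usage sketch (not part of the original example); the DataFrame, column names, and model path are hypothetical placeholders:

config = {
    "categorical_cols": ["region", "product_type"],   # hypothetical categorical columns
    "numerical_cols": ["quantity", "unit_price"],      # read from config but unused by this excerpt
    "model_path": "/tmp/regression_model",
}
# df is assumed to already hold these columns plus the default 'label' column
# expected by the regressors and the RegressionEvaluator.
build_and_save_model(df, config)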
Example No. 2
    "number_blocks"
],
                                  outputCol='features')
v_df = vectorAssembler.transform(df)
#v_df.show(3)

#train test split
splits = v_df.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features',
                      labelCol='value_usd',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)  # model hyperparameters
lr_model = lr.fit(train_df)  #train model on train_df

lr_predictions = lr_model.transform(
    test_df)  #use model to make predictions on test_df
lr_predictions.select("prediction",
                      "value_usd", "features", "date").toPandas().to_csv(
                          "predLR.csv")  #safe predictions to local drive

#Random Forest
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

data_full = sc.textFile(
Example No. 3
    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note: in many cases you can avoid converting an RDD to a DataFrame at all,
    # e.g. when importing data from a real database or reading it through
    # Structured Streaming.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

    # Train the model using our training data
    model = lir.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our linear regression model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()
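
    # Not part of the original snippet: one way to reduce the zipped
    # (prediction, label) pairs above to a single error metric.
    import math
    rmse = math.sqrt(sum((p - l) ** 2 for p, l in predictionAndLabel) / len(predictionAndLabel))
    print("RMSE: %.4f" % rmse)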
Example No. 4
for i in range(len(finl)):               # remove duplicates introduced by the join
    if finl[i]['Id'] not in keyl:
        finll.append(finl[i])
        keyl.append(finl[i]['Id'])
df_fin=sqlcontext.createDataFrame(finll)
df_fin.show()

#print(finll)
for col in df_fin.columns:
    if col=="rating":
        df_fin = df_fin.withColumn(col,df_fin[col].cast('float'))
assembler=VectorAssembler(inputCols=['age'],outputCol='features') 
output=assembler.transform(df_fin) 
final_data=output.select('features','rating') 
train_data,test_data=final_data.randomSplit([0.7,0.3]) 
rating_lr=LinearRegression(featuresCol='features',labelCol='rating')
train_rating_model=rating_lr.fit(train_data) 
rating_results=train_rating_model.evaluate(train_data)
unlabeled_data=test_data.select('features')
predictions=train_rating_model.transform(unlabeled_data) 
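# Not part of the original snippet: test_data still carries the true ratings, so
# the fitted model can also be evaluated directly on the held-out split.
test_rating_results=train_rating_model.evaluate(test_data)
print('Test RMSE:', test_rating_results.rootMeanSquaredError)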
#pp=sqlcontext.read.json('inp1.json')
#pp.show()
#data=pp.map(lambda x: x.split(','))
#r=data.collect()
#print(r)
import json

with open('inp1.json') as f:
  data = json.load(f)

match_date=data['date']
Example No. 5
 def scalarSparkLinearRegression(self):
     regr = LinearRegression()
     model = regr.fit(self.train)
     return model
Example No. 6
 def test_write_property(self):
     lr = LinearRegression(maxIter=1)
     self.assertTrue(isinstance(lr.write, MLWriter))
def main():
    # LOAD THE DATA
    total_features, total_prices = load_boston(True)
    col_list = load_boston()['feature_names']

    df = pd.DataFrame(total_features)
    df.columns = col_list
    df['price'] = total_prices
    print(df.head())

    # save to csv
    df.to_csv('boston.csv', index=False)
    # load data with spark way
    data = sc.textFile('boston.csv').map(lambda line: line.split(","))
    headers = data.first()
    traindata = data.filter(lambda row: row != headers)
    sqlContext = SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(traindata, [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
        'PTRATIO', 'B', 'LSTAT', 'price'
    ])
    dataFrame.take(2)
    print('-' * 70)
    print(dataFrame.take(30))
    print('-' * 70)
    # convert string to float in  PYSPARK

    # https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
    #for col in df.columns:
    #    dataFrame = dataFrame.withColumn(col, dataFrame[col].cast("float"))
    dataFrame = dataFrame.withColumn("CRIM_", dataFrame["CRIM"].cast("float"))
    dataFrame = dataFrame.withColumn("ZN_", dataFrame["ZN"].cast("float"))
    dataFrame = dataFrame.withColumn("price_",
                                     dataFrame["price"].cast("float"))
    dataFrame.createOrReplaceTempView("temp_sql_table")
    spark_sql_output = sqlContext.sql("""SELECT 
	                    CRIM_,
	                    ZN_,
	                    price_
	                    FROM temp_sql_table """)
    print(spark_sql_output.take(10))

    trainingData = spark_sql_output.rdd.map(
        lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
    trainingData.show()
    featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(trainingData)

    (trainingData, testData) = trainingData.randomSplit([0.7, 0.3])

    #################### SPARK ML  ####################

    # Define LinearRegression algorithm
    lr = LinearRegression()

    # Fit 2 models, using different regularization parameters
    modelA = lr.fit(trainingData, {lr.regParam: 0.0})
    modelB = lr.fit(trainingData, {lr.regParam: 100.0})

    # Make predictions
    predictionsA = modelA.transform(trainingData)
    print('-' * 70)
    print('MODEL A : ')
    predictionsA.select("prediction", "label", "features").show(30)
    print('-' * 70)

    predictionsB = modelB.transform(trainingData)
    print('-' * 70)
    print('MODEL B : ')
    predictionsB.select("prediction", "label", "features").show(30)
    print('-' * 70)

    # Evaluate the model
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictionsA)
    print('-' * 70)
    print("ModelA: Root Mean Squared Error = " + str(RMSE))
    print('-' * 70)
    # ModelA: Root Mean Squared Error = 128.602026843

    RMSE = evaluator.evaluate(predictionsB)
    print('-' * 70)
    print("ModelB: Root Mean Squared Error = " + str(RMSE))
    print('-' * 70)
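
    # Not part of the original snippet: the held-out testData split created above is
    # never scored; a natural extension is to evaluate it with the same evaluator.
    predictionsA_test = modelA.transform(testData)
    print("ModelA (test set): Root Mean Squared Error = " + str(evaluator.evaluate(predictionsA_test)))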
Example No. 8
    df = spark.createDataFrame(
        [
            (3.0, 1.0),  ## y, x
            (7.0, 3.0),
            (5.0, 2.0)
        ],
        ["y", "x"])
    # assemble column x into a feature vector
    assembler = VectorAssembler(inputCols=["x"], outputCol="features")
    dfv = assembler.transform(df)
    print(len(dfv.rdd.take(1)[0]))
    dfv.show()
    lr = LinearRegression(labelCol='y',
                          maxIter=50,
                          regParam=0.01,
                          solver="normal",
                          fitIntercept=True)
    model = lr.fit(dfv)
    print("intercept: ", model.intercept)  # 偏置b
    print("---> ", model.coefficients)  # 各特征变量的系数a
    test0 = spark.createDataFrame([(4, Vectors.dense(4.0)),
                                   (5, Vectors.dense(5.0))],
                                  ["id", "features"])
    prediction = model.transform(test0)  # predict
    prediction.show()
    selected = prediction.select("id", "features", "prediction")
    for row in selected.collect():
        print(row)
    # print(abs(model.transform(test0).head().prediction - (-1.0)) < 0.001)
Example No. 9
# Vector assembling features.
inputCols_assem = [x for x in df_r.columns if x not in ['id', 'label']]
assembler = VectorAssembler(\
        inputCols = inputCols_assem, \
        outputCol = 'features')
df_r = assembler.transform(df_r)

# Train test split the Data.
train, test = df_r.randomSplit([0.8, 0.2], seed=12345)
print('Finished data preprocessing...')

# Fitting & predicting pipeline.
evaluator = RegressionEvaluator(metricName="mae")
# lr = LinearRegression().setSolver("l-bfgs")
lr = LinearRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [500]) \
                         .addGrid(lr.regParam, [0]) \
                         .addGrid(lr.elasticNetParam, [1]) \
                         .build()
lr_cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, \
                       evaluator=evaluator, numFolds=3)
lrModel = lr_cv.fit(train)  # Takes 30 min to run.
bestModel = lrModel.bestModel
print('MAE: ', lrModel.avgMetrics)
print('Best Param (regParam): ', bestModel._java_obj.getRegParam())
print('Best Param (MaxIter): ', bestModel._java_obj.getMaxIter())
print('Best Param (elasticNetParam): ', bestModel._java_obj.getElasticNetParam())
print('Param Map: ', bestModel._java_obj.extractParamMap())
bestModel.save(
Example No. 10
  #######################
  # LOG HYPERPARAMETERS #
  #######################

  run.log("Model Name", model_name)
  run.log("Max Iterations", maxIters)
  run.log("Regularization Rate", regParam)
  run.log_list("Feature Columns", feature_cols)

  ###############
  # TRAIN MODEL #
  ###############

  print("  * Training {0} model".format(model_name))
  # Instantiate New LinearRegression Object
  lr = LinearRegression(featuresCol='features', labelCol='duration_minutes', maxIter=maxIters, regParam=regParam, solver="auto")

  # Train model on transformed training data
  lr_model = lr.fit(trainDF_transformed)

  lr_full_model = feature_model.copy()
  lr_full_model.stages.append(lr_model)

  print("  * Model trained, scoring validation data")
  # Run the full model (feature steps and trained model)
  validation_scored = lr_full_model.transform(validDF)

  #####################
  # MODEL PERFORMANCE #
  #####################
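
  # Not part of the original snippet: a plausible way to finish this section,
  # scoring the validation predictions against the 'duration_minutes' label used
  # above and logging the result to the run.
  from pyspark.ml.evaluation import RegressionEvaluator
  evaluator = RegressionEvaluator(labelCol="duration_minutes", predictionCol="prediction", metricName="rmse")
  rmse = evaluator.evaluate(validation_scored)
  run.log("RMSE", rmse)
  print("  * Validation RMSE: {0}".format(rmse))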
Example No. 11
model_df=features_df.select('features','output')     # build the dataset used for linear regression

# 5 - split the data into training and test sets
train_df,test_df=model_df.randomSplit([0.7,0.3])     # 70/30 split between training and test data

print((train_df.count(), len(train_df.columns)))

print((test_df.count(), len(test_df.columns)))

# 6 - build the linear regression model

from pyspark.ml.regression import LinearRegression         # import the linear regression class

print('-------------- Building the linear regression model ------------------')

lin_Reg=LinearRegression(labelCol='output')                 # labelCol names the column to predict, alongside the features column

lr_model=lin_Reg.fit(train_df)                              # fit on the training data; returns a fitted LinearRegressionModel

print('{}{}'.format('Intercept: ',lr_model.intercept))         # intercept of the fitted equation

print('{}{}'.format('Coefficients: ',lr_model.coefficients))  # regression coefficients, corresponding to var_1 ... var_5

training_predictions=lr_model.evaluate(train_df)            # inspect predictions on the training data

print('{}{}'.format('Mean squared error: ',training_predictions.meanSquaredError))

print('{}{}'.format('R-squared: ',training_predictions.r2 ))  # coefficient of determination; closer to 1 means more accurate predictions

# 7 - evaluate the fitted model lr_model on the held-out test data
test_results=lr_model.evaluate(test_df)
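
# Not part of the original snippet: the summary returned by evaluate() exposes the
# same metrics for the held-out test split.
print('{}{}'.format('Test R-squared: ', test_results.r2))
print('{}{}'.format('Test RMSE: ', test_results.rootMeanSquaredError))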
Example No. 12
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session,
        dataset_regression):
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr,
                                       estimatorParamMaps=lrParamMaps,
                                       evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)

        assert best_params == load_json_artifact("best_parameters.json")

        search_results = load_json_csv("search_results.csv")

    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v
               for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id,
        "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name

        assert math.isclose(avg_metric_value,
                            run_data.metrics[avg_metric_name],
                            rel_tol=1e-6)
        assert math.isclose(avg_metric_value,
                            float(row.get(avg_metric_name)),
                            rel_tol=1e-6)

        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) >= Version("3.3"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value,
                                run_data.metrics[std_metric_name],
                                rel_tol=1e-6)
            assert math.isclose(std_metric_value,
                                float(row.get(std_metric_name)),
                                rel_tol=1e-6)
Example No. 13
 def test_attr_spark(self):
     conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
     num_partitions = 2
     enumerator = "join"
     model_type = "regression"
     label = 'target'
     sparkContext = SparkContext(conf=conf)
     sqlContext = SQLContext(sparkContext)
     train_df = sqlContext.read.csv("toy_train.csv", header='true',
                         inferSchema='true')
     test_df = sqlContext.read.csv("toy.csv", header='true',
                         inferSchema='true')
     # initializing stages of main transformation pipeline
     stages = []
     # list of categorical features for further hot-encoding
     cat_features = ['a', 'b', 'c']
     for feature in cat_features:
         string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
         encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
         encoder.setDropLast(False)
         stages += [string_indexer, encoder]
     assembler_inputs = [feature + "_vec" for feature in cat_features]
     assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
     stages += [assembler]
     assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
     stages += [assembler_final]
     pipeline = Pipeline(stages=stages)
     train_pipeline_model = pipeline.fit(train_df)
     test_pipeline_model = pipeline.fit(test_df)
     train_df_transformed = train_pipeline_model.transform(train_df)
     test_df_transformed = test_pipeline_model.transform(test_df)
     train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
     test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
     decode_dict = {}
     counter = 0
     cat = 0
     for feature in cat_features:
         colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
         colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
         for item in colIdx:
             decode_dict[counter] = (cat, item, colIdx[item], counter)
             counter = counter + 1
         cat = cat + 1
     train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
     test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
     lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
     lr_model = lr.fit(train_df_transform_fin)
     eval = lr_model.evaluate(test_df_transform_fin)
     f_l2 = eval.meanSquaredError
     pred = eval.predictions
     pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
     predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
     converter = IndexToString(inputCol='features', outputCol='cats')
     all_features = list(decode_dict)
     predictions = predictions.collect()
     spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="join")
     spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                   k=self.k, w=self.w, loss_type=self.loss_type, enumerator="union")
     self.assertEqual(3, len(spark_join.slices))
     print("check1")
     self.assertEqual(spark_join.min_score, spark_union.min_score)
     print("check2")
     self.assertEqual(spark_join.keys, spark_union.keys)
     print("check3")
     self.assertEqual(len(spark_join.slices), len(spark_union.slices))
     print("check4")
     idx = -1
     for sliced in spark_join.slices:
         idx += 1
         self.assertEqual(sliced.score, spark_union.slices[idx].score)
     print("check5")
Example No. 14
def pipeline(request):
    unique_fields = custom_fields(request)
    date_column = CustomFields.objects.first()
    date_column = date_column.date_column

    # First, read the data
    data_df = read_df(request, 'clean')
    json_df = data_df.toPandas()
    json_df.to_json()

    # Cast all the columns to numeric
    new_df = data_df.select(
        [col(c).cast("double").alias(c) for c in data_df.columns])
    new_df = new_df.fillna(0.0)
    new_df.show()

    # Split data into training and test sets
    train, test = new_df.randomSplit([0.7, 0.3])

    # Feature Processing
    featuresCols = new_df.columns
    featuresCols.remove(unique_fields['prediction'])

    try:
        featuresCols.remove(date_column)
    except ValueError:
        # the date column may not be present among the feature columns
        pass

    # This concatenates all feature columns into a single feature vector in a new column 'rawFeatures'
    vectorAssembler = VectorAssembler(inputCols=featuresCols,
                                      outputCol='rawFeatures')

    # Model Training
    standardScaler = StandardScaler(inputCol="rawFeatures",
                                    outputCol="features")
    lr = LinearRegression(labelCol=unique_fields['prediction'],
                          maxIter=10,
                          regParam=.01)

    # Model tuning
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxIter, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

    # We define an evaluation metric.
    # This tells CrossValidator how well we are doing by comparing the true labels with predictions
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=lr.getLabelCol(),
                                    predictionCol=lr.getPredictionCol())

    # Declare the CrossValidator which runs model tuning for us.
    cv = CrossValidator(estimator=lr,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid)

    stages = [vectorAssembler, standardScaler, cv]

    # Train the pipeline
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(train)
    predictions = model.transform(test)

    rmse = evaluator.evaluate(predictions)
    print("RMSE on our test set is: " + str(rmse))

    predictions.show()

    predicted_df = predictions.toPandas()
    predicted_df.to_json()
    # rmse = 23
    context = {'all_data': json_df, 'rmse': rmse, 'predicted': predicted_df}
    return render(request, 'show_predictions.html', context)
Example No. 15
                    inferSchema=True)
ad.show(5)

# In[3]:
# Transform data structure

from pyspark.ml.linalg import Vectors
ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[1:4]), x[-1]]).toDF(
    ['features', 'label'])
ad_df.show(5)

# In[4]:
# Build linear regression model

from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='label')

# In[5]:
# Fit the model
lr_model = lr.fit(ad_df)

# In[6]:
# Prediction
pred = lr_model.transform(ad_df)
pred.show(5)

# In[7]:
# Model evaluation

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
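
# Not part of the original snippet: RegressionEvaluator defaults to RMSE; other
# metrics can be requested through a param map.
print("RMSE: %f" % evaluator.evaluate(pred))
print("r2: %f" % evaluator.evaluate(pred, {evaluator.metricName: "r2"}))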
Example No. 16
# Feature scaling
standardScaler = StandardScaler(inputCol="features",
                                outputCol="features_scaled")
scaler = standardScaler.fit(df)
scaled_df = scaler.transform(df)

# Split data into training and test
train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234)
train_data = train_data.select("features_scaled", "label")
test_data = test_data.select("features_scaled", "label")
train_data = train_data.withColumnRenamed("features_scaled", "features")
test_data = test_data.withColumnRenamed("features_scaled", "features")

# Declare regression ML model
lr = LinearRegression(labelCol="label",
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)

# Train model on training data
linearModel = lr.fit(train_data)

# Test model on test data
predicted = linearModel.transform(test_data)
predictions = predicted.select("prediction").rdd.map(lambda x: x[0])
labels = predicted.select("label").rdd.map(lambda x: x[0])
predictionAndLabel = predictions.zip(labels).collect()
print(predictionAndLabel[:5])

# model stats
#linearModel.coefficients
#linearModel.intercept
Example No. 17
# Check column data types
print('\n', cars.dtypes, '\n')

assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy', 'density', 'density_area', 'density_volume'],
                            outputCol='features')
cars = assembler.transform(cars)

kars = cars.select('consumption', 'features')

print(kars.toPandas().sample(12))

# Split the data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

regression = LinearRegression(labelCol='consumption').fit(kars_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(kars_test)
print("\nStandard Linear Regression")
#print("\nStandard Linear Regression\nSample")
#print(predictions.toPandas().sample(12))

# Print the coefficients and RMSE for linear regression
trainingSummary = regression.summary
print("Coefficients: %s" % str(regression.coefficients))
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)

# Ridge regression
ridge = LinearRegression(labelCol='consumption', elasticNetParam=0, regParam=0.1).fit(kars_train)
# Create predictions for the testing data and take a look at the predictions
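# (Not in the original excerpt) mirroring the steps used for the standard model above:
predictions = ridge.transform(kars_test)
print("\nRidge Regression")
print("Coefficients: %s" % str(ridge.coefficients))
print("RMSE: %f" % ridge.summary.rootMeanSquaredError)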
Example No. 18
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

stockSI = StringIndexer(inputCol="Symbol",outputCol="indexedSymbol", handleInvalid='keep')
sectorSI = StringIndexer(inputCol='Sector', outputCol='indexedSector', handleInvalid='keep')
industrySI = StringIndexer(inputCol='Industry', outputCol='indexedIndustry', handleInvalid='keep')
stockEnc = OneHotEncoder(inputCol="indexedSymbol", outputCol="encodedSymbol")
sectorEnc = OneHotEncoder(inputCol="indexedSector", outputCol="encodedSector")
industryEnc = OneHotEncoder(inputCol="indexedIndustry", outputCol="encodedIndustry")
catVa = VectorAssembler(inputCols=["encodedSymbol","encodedSector","encodedIndustry"], outputCol="catFeatures")
numVa = VectorAssembler(inputCols=["yVolume", "ySpread", "yPriceChange"], outputCol="numFeatures")

splits = [-float("inf"), -0.2, -0.15, -0.1, -0.05, -0.0, 0.05, 0.1, 0.2, float("inf")]


norm = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
stockLr = LinearRegression(labelCol='PriceChange', featuresCol='features', predictionCol='pPriceChange')
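
# 'va' is not defined in this excerpt; a plausible (assumed) definition, assembling the
# encoded and numeric columns into the 'features' column that norm and stockLr expect:
va = VectorAssembler(inputCols=["encodedSymbol", "encodedSector", "encodedIndustry",
                                "yVolume", "ySpread", "yPriceChange"],
                     outputCol="features")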

stages = [stockSI, sectorSI, industrySI, stockEnc, sectorEnc, industryEnc, va, norm]

lrPipeline = Pipeline(stages=stages+[stockLr])

(trainingData, testData) = priceChanges.randomSplit([0.7, 0.3])

lrModel = lrPipeline.fit(trainingData)
lrPredictions = lrModel.transform(testData)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.stages[8].coefficients))
print("Intercept: %s" % str(lrModel.stages[8].intercept))

# Summarize the model over the training set and print out some metrics
Example No. 19
flights_assembled = assembler.transform(flights_to_model)
flights_assembled.show(5)

# Randomly split the assembled data into a training
# sample (70% of records) and a test sample (30% of
# records):

(train, test) = flights_assembled.randomSplit([0.7, 0.3])

# Import and use `LinearRegression` to specify the linear
# regression model and fit it to the training sample:

from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features", labelCol="arr_delay")

lr_model = lr.fit(train)

# Examine the model intercept and slope:

lr_model.intercept

lr_model.coefficients

# Evaluate the linear model on the test sample:

lr_summary = lr_model.evaluate(test)

# R-squared is the fraction of the variance in the test
# sample that is explained by the model:
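
# (Not in the original excerpt) the quantity the comment above refers to:
lr_summary.r2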
Example No. 20
housing_train_df.show(5)
from pyspark.sql.functions import round
housing_train_df = housing_train_df.withColumn("label", round('log_price', 4))

print("-----%%-----")

housing_train_df.show(5)

seed = 42

train_df, test_df = housing_train_df.randomSplit( [0.7, 0.3], seed=seed)

from pyspark.ml.regression import LinearRegression

linreg = LinearRegression(maxIter=500, regParam=0.0)

lm = linreg.fit(train_df)

print("Intercept ", lm.intercept)
print("Coefficients ", lm.coefficients)

y_pred = lm.transform(test_df)

y_pred.select('features', 'label', 'prediction').show(5)

from pyspark.sql.functions import exp

y_pred = y_pred.withColumn("y_pred", exp('prediction'))

y_pred.show(5)
Example No. 21
featureAssembler = VectorAssembler(inputCols=[
    "wheel-base", "length", "width", "height", "curb-weight", "engine-size",
    "compression-ratio", "city-mpg", "highway-mpg"
],
                                   outputCol='features')

output = featureAssembler.transform(training)

splits = output.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

lr = LinearRegression(featuresCol='features',
                      labelCol='price',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# # Fit the model
# lrModel = lr.fit(training)
#
# # Print the coefficients and intercept for linear regression
# print("Coefficients: %s" % str(lrModel.coefficients))
Example No. 22
def train(p_lag, dataFrame, model_path=None):
    # print(p_lag)
    current_lag = p_lag

    # df_len_ori: number of variables in model, K
    x_list = dataFrame.columns
    # print('x_list',x_list)
    df_len_ori = len(x_list)
    # print("df_len_ori is ")
    # print(df_len_ori)
    dataFrame_names = dataFrame.columns
    dataFrame = dataFrame.withColumn("id", monotonically_increasing_id())
    # dataFrame.printSchema()
    # dataFrame.show(10)
    # Here, VAR model regression_type is "const" same to R VAR library, and the default in Python VAR library
    # w = Window().partitionBy().orderBy(col("id"))
    w = Window().partitionBy().orderBy(col("id"))
    df_len = len(dataFrame.columns)
    ys_lagged_list = ["const"]
    # Making sure first column is not considered for forecasting
    for i in range(1, p_lag + 1):
        for j in range(0, df_len - 1):
            # making sure index column is not considered as feature column
            if x_list[j] != 'TimeStamp':
                ys_lagged_list.append("%st-%s" % (x_list[j], str(i)))
                print('2', ys_lagged_list)
                dataFrame = dataFrame.withColumn(
                    "%st-%s" % (x_list[j], str(i)),
                    lag(dataFrame[j], i, 0).over(w))
                # print('3')
    # print("Showing DataFrame")
    dataFrame.show(5)
    print('ys_lagged_list', ys_lagged_list)

    # add "const" column of value 1 to get intercept when fitting the regression model
    dataFrame = dataFrame.withColumn("const", lit(1))
    dataFrame = dataFrame.withColumn("const", lag("const", p_lag, 0).over(w))
    dataFrame = dataFrame.withColumn("rid", monotonically_increasing_id())
    dataFrame = dataFrame.filter(dataFrame.rid >= p_lag)
    # dataFrame.show(5)
    #     build ys_lagged dataframe, will be used in F-test
    ys_lagged = dataFrame.select(ys_lagged_list)
    ys_lagged_len = ys_lagged.count()
    # print('show the lagged values here')
    # ys_lagged.show(10)

    dataFrame = dataFrame.drop('id')
    dataFrame = dataFrame.drop('rid')
    dataFrame = dataFrame.drop('const')
    input_feature_name = dataFrame.schema.names

    # input_feature_name.remove("id")
    for x_name in x_list:
        input_feature_name.remove('{}'.format(x_name))

    # assemble the vector for MLlib linear regression
    assembler_for_lag = VectorAssembler(inputCols=input_feature_name,
                                        outputCol="features")

    # a = {}
    # b = {}
    models = {}
    lrModels = []
    # print('Iterating the features')
    for select_y in x_list:
        if select_y != 'TimeStamp':
            model_key = '{}'.format(select_y)
            # ML model will be trained for each micro batch if existing model is not provided
            # print('model path',model_path+ '{}'.format(select_y))
            lr = LinearRegression(featuresCol='features',
                                  labelCol='{}'.format(select_y),
                                  maxIter=1000,
                                  fitIntercept=True)
            pipeline = Pipeline(stages=[assembler_for_lag, lr])
            model_val = pipeline.fit(dataFrame)
            # model_val.write().overwrite().save(model_path+'{}'.format(select_y))
            lrModels.append('{}'.format(select_y))
            models['{}'.format(select_y)] = model_val
    return lrModels, models
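
A minimal usage sketch (not part of the original); series_df is a hypothetical DataFrame with a 'TimeStamp' column plus the numeric series to model:

lrModels, models = train(p_lag=2, dataFrame=series_df)
print(lrModels)  # names of the modeled series; models[name] is the fitted PipelineModel for each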
Example No. 23
  bin/spark-submit examples/src/main/python/ml/train_validation_split.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TrainValidationSplit")\
        .getOrCreate()

    # $example on$
    # Prepare training and test data.
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")
    train, test = data.randomSplit([0.9, 0.1], seed=12345)

    lr = LinearRegression(maxIter=10)

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True])\
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    return v_assembler.transform(data)

if __name__ == "__main__":
    train_ratio = 0.8
    test_ratio = 1 - train_ratio

    # create SparkSession - the entry to the cluster
    spark = SparkSession.builder.master("spark://192.168.50.10:7077").appName("Linear regression with pipeline - Boston").getOrCreate()

    data = prepare_data("BostonHousing.csv")

    # split data into train and test DataFrames
    train, test = data.randomSplit([train_ratio, test_ratio])

    poly_exp = PolynomialExpansion(degree=3, inputCol="features", outputCol="poly_features")

    lr = LinearRegression(regParam=0.1, featuresCol="poly_features")

    pipeline = Pipeline(stages=[poly_exp, lr])
    # fit the model
    model = pipeline.fit(train)

    evaluator = RegressionEvaluator()

    prediction_and_labels = model.transform(train).select("prediction", "label")
    print("Precision train: " + str(evaluator.evaluate(prediction_and_labels)))

    prediction_and_labels = model.transform(test).select("prediction", "label")
    print("Precision test: " + str(evaluator.evaluate(prediction_and_labels)))
Example No. 25
 def binomialSparkLinearRegression(self):
     regr = LinearRegression()
     model = regr.fit(self.Xtrain, self.Ytrain)
     return model
assembler = VectorAssembler(inputCols=[
    # ... input column names truncated in this excerpt ...
],
                            outputCol='features')

output = assembler.transform(data)  # adds a "features" column
# dependent variable = "Yearly Amount Spent"
final_data = output.select('features', 'Yearly Amount Spent')

### Train vs Test split
train_data, test_data = final_data.randomSplit([0.7, 0.3])
#train_data.describe().show() #summary statistics of label column

###################################
## Fit Linear Regression Model  ###
###################################
# train model on training set
lr = LinearRegression(labelCol='Yearly Amount Spent')
lr_model = lr.fit(train_data)
# predict test set
test_results = lr_model.evaluate(test_data)

###################################
## EVALUATE LINEAR REGRESSION  ####
###################################

#test_results.residuals.show() # residuals
print(test_results.rootMeanSquaredError)  # root mean squared error
print(test_results.r2)  # R squared
print(test_results.meanAbsoluteError)
print(test_results.meanSquaredError)

###########################################################
Example No. 27
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ["close", "features"]
df = df.select(selectedCols)
df.printSchema()

# split to train & test
train, test = df.randomSplit([0.7, 0.3], seed=2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

# fit
lr = LinearRegression(maxIter=5,
                      regParam=0.0,
                      solver="normal",
                      labelCol="close")
lrModel = lr.fit(train)

trainingSummary = lrModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

train.describe().show()

# get prediction
lr_predictions = lrModel.transform(test)
lr_predictions.select("prediction", "close", "features").show(5)

# R2 on test
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
Example No. 28
"""
+-------+------------------+
|summary|       Petal_Width|
+-------+------------------+
|  count|               108|
|   mean|1.1703703703703703|
| stddev|0.7605039228590301|
|    min|               0.1|
|    max|               2.5|
+-------+------------------+
"""

# let's create the regression model
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features',
                      labelCol='Petal_Width',
                      predictionCol='prediction')

# adapt it to the data
lr_model = lr.fit(train)

# print the coefficients
print("Coefficients: {} Intercept: {}".format(lr_model.coefficients,
                                              lr_model.intercept))
# Coefficients: [-0.2384270208878119,0.20670865063951435,0.5267224329790431] Intercept: -0.03679454895160388

# let's create predictions on test data
test_features = test.select('features')

predictions = lr_model.transform(test_features)
import os
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()

data_path = os.getcwd() + '\\data'

json_df1_path = data_path + '\\example_8.json'

df = spark.read.json(json_df1_path, multiLine=True)

df.show()

vectorAssembler = VectorAssembler(inputCols=["diameter"], outputCol="features")

df_vector = vectorAssembler.transform(df)

df_vector.show()

lr = LinearRegression(featuresCol="features", labelCol="diameter")

lrModel = lr.fit(df_vector)

print(lrModel.coefficients)

print(lrModel.intercept)

print(lrModel.summary.rootMeanSquaredError)
# In[136]:

from pyspark.ml.regression import RandomForestRegressor, LinearRegression
# This is our regressor
from pyspark.ml.evaluation import RegressionEvaluator
# Module to evaluate fit

# In[127]:

rf = RandomForestRegressor(labelCol="ups", featuresCol="features", numTrees=5)
# This is our regressor="score", featuresCol="features", numTrees=5)
# Create a random forest regressor

# In[39]:

rf = LinearRegression(labelCol="ups", featuresCol="features")
# Create a linear regressor

# In[128]:

(trainingData, testData) = vectorDf.randomSplit([0.7, 0.3])

# In[129]:

model = rf.fit(trainingData)

# In[130]:

predictions = model.transform(testData)

# In[132]:
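
# Not part of the original notebook: the RegressionEvaluator imported earlier can
# score the held-out predictions from the fitted regressor.
evaluator = RegressionEvaluator(labelCol="ups", predictionCol="prediction", metricName="rmse")
print("RMSE on test data:", evaluator.evaluate(predictions))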