Пример #1
0
def execute(input_path='/Users/brillap/downloads/kc_house_data.csv', seed=None):
    """Fit a linear regression for house prices and show test predictions.

    Pipeline, in order:
      1. Read the CSV at ``input_path`` (first row is the header, column
         types are inferred).
      2. Pack five numeric columns into a single ``features`` vector.
      3. Normalize each vector with p=2.0 into ``normFeatures``.
      4. Randomly split the rows with weights 0.7 / 0.3.
      5. Fit ``LinearRegression`` (label ``price``) on the first split and
         print the RMSE reported by the model's summary.
      6. Score the second split and ``show()`` the predictions.

    Relies on a ``spark`` session object and the PySpark ML classes being
    available in the enclosing module.

    Args:
        input_path: Location of the CSV file. Defaults to the original local
            path; any URI Spark can read may be passed instead, e.g.
            ``'hdfs://hadoop-master:9000/user/root/kc_house_data.csv'``.
        seed: Optional seed forwarded to ``randomSplit``. ``None`` (default)
            keeps the original non-reproducible split.

    Returns:
        None. Results are printed to stdout.
    """
    raw = spark.read.csv(input_path, header=True, inferSchema=True)

    assembled = VectorAssembler(
        inputCols=["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors"],
        outputCol="features",
    ).transform(raw)

    normalized = Normalizer(
        inputCol="features",
        outputCol="normFeatures",
        p=2.0,
    ).transform(assembled)

    # elasticNetParam=1.0 together with regParam=1.0 selects the pure L1
    # end of Spark's elastic-net penalty (see the LinearRegression docs).
    linear_regression = LinearRegression(
        labelCol="price",
        featuresCol="normFeatures",
        maxIter=10,
        regParam=1.0,
        elasticNetParam=1.0,
    )

    train_split, test_split = normalized.randomSplit([0.7, 0.3], seed=seed)
    lr_model = linear_regression.fit(train_split)

    # The summary is computed on the data the model was fitted on.
    print("RMSE: %f" % lr_model.summary.rootMeanSquaredError)

    predicted_data = lr_model.transform(test_split).select(
        "features", "normFeatures", "price", "prediction")
    predicted_data.show()
Пример #2
0
def execute(input_path='hdfs://hadoop-master:9000/kc_house_data.csv',
            output_path="hdfs://hadoop-master:9000/prediction.csv",
            reference_year=2020,
            write_mode=None,
            seed=None):
    """Fit a linear regression for house prices and write test predictions.

    Pipeline, in order:
      1. Read the CSV at ``input_path`` (first row is the header, column
         types are inferred).
      2. Keep rows with ``price > 0``, add an ``age`` column computed as
         ``reference_year - yr_built``, and drop duplicate rows.
      3. Pack seventeen columns (including ``age``) into a ``features``
         vector and normalize it into ``normFeatures`` using the
         ``Normalizer`` defaults.
      4. Randomly split the rows with weights 0.7 / 0.3.
      5. Fit ``LinearRegression`` (label ``price``) on the first split.
      6. Score the second split and write its ``price`` and ``prediction``
         columns as CSV to ``output_path``.

    Relies on a ``spark`` session object and the PySpark ML classes being
    available in the enclosing module.

    Args:
        input_path: Location of the input CSV; any URI Spark can read.
        output_path: Destination passed to ``DataFrameWriter.csv``.
        reference_year: Year from which ``yr_built`` is subtracted to
            obtain ``age``. Defaults to the original constant 2020.
        write_mode: Save mode forwarded to ``DataFrameWriter.csv``. ``None``
            (default) keeps Spark's default mode, under which the write
            fails if ``output_path`` already exists; pass ``"overwrite"``
            to allow re-runs.
        seed: Optional seed forwarded to ``randomSplit``. ``None`` (default)
            keeps the original non-reproducible split.

    Returns:
        None. Predictions are persisted to ``output_path``.
    """
    input_data = spark.read.csv(input_path, header=True, inferSchema=True)

    cleaned = (
        input_data
        .filter(input_data.price > 0)
        .withColumn("age", reference_year - input_data.yr_built)
        .drop_duplicates()
    )

    feature_columns = [
        "bedrooms", "bathrooms", "sqft_living", "floors", "condition", "sqft_lot",
        "waterfront", "view", "grade", "sqft_above", "sqft_basement", "age",
        "zipcode", "lat", "long", "sqft_living15", "sqft_lot15",
    ]
    assembled = VectorAssembler(
        inputCols=feature_columns,
        outputCol="features",
    ).transform(cleaned)

    normalized = Normalizer(
        inputCol="features",
        outputCol="normFeatures",
    ).transform(assembled)

    # elasticNetParam=1.0 together with regParam=1.0 selects the pure L1
    # end of Spark's elastic-net penalty (see the LinearRegression docs).
    linear_regression = LinearRegression(
        labelCol="price",
        featuresCol="normFeatures",
        maxIter=10,
        regParam=1.0,
        elasticNetParam=1.0,
    )

    train_split, test_split = normalized.randomSplit([0.7, 0.3], seed=seed)
    lr_model = linear_regression.fit(train_split)

    # Only the label and the prediction are persisted; the vector columns
    # are not part of the written output.
    predictions = lr_model.transform(test_split).select("price", "prediction")
    predictions.write.csv(output_path, mode=write_mode)