def main(self, sc, *args):
    """Fit a Poisson generalized linear model and save it to the output path.

    The training points are obtained by calling ``get_points_rdd(sc)`` on the
    object returned by ``self.requires()``. They are converted to a DataFrame
    with the active ``SparkSession`` and used to fit a
    ``GeneralizedLinearRegression`` configured with ``family='poisson'``,
    ``link=self.link`` and ``maxIter=self.iterations``. The fitted model is
    saved to ``self.output().path``.

    Args:
        sc: Passed through unchanged to ``get_points_rdd``.
        *args: Accepted but not used by this method.
    """
    training_points = self.requires().get_points_rdd(sc)

    estimator = GeneralizedLinearRegression(
        family='poisson',
        link=self.link,
        maxIter=self.iterations,
    )

    # Reuse the already-running session if there is one, otherwise create it.
    session = SparkSession.builder.getOrCreate()
    training_frame = session.createDataFrame(training_points)

    fitted_model = estimator.fit(training_frame)
    fitted_model.save(self.output().path)
def _expect(condition, description):
    """Raise ``AssertionError`` naming the sanity check that did not hold."""
    if not condition:
        raise AssertionError("GLR example check failed: " + description)


def generalized_linear_regression(temp_path="./"):
    """Fit a tiny Gaussian/identity GLM and round-trip it through save/load.

    The function builds a four-row DataFrame with ``label`` and ``features``
    columns, fits a ``GeneralizedLinearRegression`` on it, and verifies the
    values the original example annotated as ``True`` / ``2``. It then saves
    and reloads both the estimator and the fitted model and verifies that the
    reloaded objects match.

    Args:
        temp_path: Directory under which the ``glr`` (estimator) and
            ``glr_model`` (fitted model) directories are written. Defaults
            to the current working directory. Existing contents at those two
            locations are overwritten so the example can be re-run.

    Raises:
        AssertionError: If any of the sanity checks does not hold.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import os

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])

    # linkPredictionCol is deliberately left unset, so the transformed
    # DataFrame carries no "p" column; only "prediction" is checked below.
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
    )
    model = glr.fit(df)

    first_row = model.transform(df).head()
    _expect(abs(first_row.prediction - 1.5) < 0.001,
            "prediction for the first row should be ~1.5")
    _expect(model.numFeatures == 2, "model should report 2 features")
    _expect(abs(model.intercept - 1.5) < 0.001,
            "intercept should be ~1.5")

    # Round-trip the estimator. overwrite() lets the example be re-run;
    # a plain save() raises when the target path already exists.
    glr_path = os.path.join(temp_path, "glr")
    glr.write().overwrite().save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    _expect(glr.getFamily() == glr2.getFamily(),
            "reloaded estimator should keep the same family")

    # Round-trip the fitted model.
    model_path = os.path.join(temp_path, "glr_model")
    model.write().overwrite().save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    _expect(model.intercept == model2.intercept,
            "reloaded model should keep the same intercept")
    _expect(model.coefficients[0] == model2.coefficients[0],
            "reloaded model should keep the same first coefficient")