def test_glr_summary(self):
    """Check that the GLR training summary API is callable and well-typed.

    Fits a weighted Gaussian/identity model without intercept on a two-row
    frame, then verifies the values and types exposed by ``model.summary``
    and that ``model.evaluate`` on the training frame reports the same
    deviance.
    """
    # pyspark.ml estimators operate on pyspark.ml.linalg vectors; the legacy
    # pyspark.mllib.linalg vectors are a different column type.
    from pyspark.ml.linalg import Vectors

    df = self.spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary

    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertIsInstance(s.predictions, DataFrame)
    self.assertEqual(s.predictionCol, "prediction")
    self.assertIsInstance(s.residuals(), DataFrame)
    self.assertIsInstance(s.residuals("pearson"), DataFrame)

    coefStdErr = s.coefficientStandardErrors
    self.assertIsInstance(coefStdErr, list)
    self.assertIsInstance(coefStdErr[0], float)
    tValues = s.tValues
    self.assertIsInstance(tValues, list)
    self.assertIsInstance(tValues[0], float)
    pValues = s.pValues
    self.assertIsInstance(pValues, list)
    self.assertIsInstance(pValues[0], float)

    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    # `basestring` only exists on Python 2; `str` is the Python 3 text type.
    self.assertIsInstance(s.solver, str)
    self.assertIsInstance(s.aic, float)
    self.assertIsInstance(s.deviance, float)
    self.assertIsInstance(s.nullDeviance, float)
    self.assertIsInstance(s.dispersion, float)

    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def model():
    """Fit a Gaussian GLM of summed cost on interest and phone price category.

    Reads the parquet file at ``DATA_PARQUET``, sums ``cost`` per
    (hash_number_A, interest_1, phone_price_category) as the label,
    string-indexes the two categorical columns into a feature vector and
    fits a regularised ``GeneralizedLinearRegression`` on the result.

    Returns:
        The fitted generalized linear regression model.
    """
    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')
    sample = sql.sql('''
        select
            hash_number_A
            ,interest_1
            ,phone_price_category
            ,sum(cost) as label
        from data
        group by hash_number_A, interest_1, phone_price_category''')

    pipeline = Pipeline(stages=[
        StringIndexer(inputCol='interest_1', outputCol='interest'),
        StringIndexer(inputCol='phone_price_category', outputCol='phone_price'),
        VectorAssembler(inputCols=['interest', 'phone_price'], outputCol='features'),
    ])
    sample = pipeline.fit(sample).transform(sample)

    # Other families accepted by the estimator:
    # 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'
    regression = GeneralizedLinearRegression(family='gaussian',
                                             labelCol='label',
                                             featuresCol='features',
                                             maxIter=10,
                                             regParam=0.3)
    # Return the fitted model instead of stopping in a leftover debugger hook.
    return regression.fit(sample)
def generalizeRegression(df, label, features, adjust):
    """Fit a Poisson GLM and return its RMSE and predictions.

    The model is fitted and evaluated on the same DataFrame.

    Args:
        df: Input DataFrame.
        label: Name of the label column.
        features: Names of the numeric feature columns. The list is not
            modified.
        adjust: Names of non-numeric columns. Each one is string-indexed into
            a new ``"<name>_num"`` column that is added to the features.

    Returns:
        dict with keys ``"RMSE"`` (float) and ``"predictions"`` (list of the
        values in the ``prediction`` column).
    """
    # Work on a copy so repeated calls with the same list do not accumulate
    # duplicate "<name>_num" entries in the caller's list.
    feature_cols = list(features)

    # Columns with non-numerical values are indexed to numeric columns.
    for name in adjust:
        indexed_name = "{}_num".format(name)
        indexer = StringIndexer(inputCol=name, outputCol=indexed_name)
        df = indexer.fit(df).transform(df)
        feature_cols.append(indexed_name)

    # Assemble the feature vector expected by the estimator.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    assembled = assembler.transform(df)

    gr = GeneralizedLinearRegression(featuresCol='features', labelCol=label,
                                     regParam=0.3, family="poisson")
    predictions = gr.fit(assembled).transform(assembled)

    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    return {
        "RMSE": rmse,
        "predictions": [r["prediction"] for r in predictions.select("prediction").collect()],
    }
def logisT(value):
    """Fit a Gaussian/identity GLM on ``value`` and print its parameters.

    Args:
        value: Training DataFrame passed straight to ``fit``.

    Returns:
        Tuple ``(coefficients, 1)`` where ``coefficients`` comes from the
        fitted model.
    """
    estimator = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        maxIter=10,
        regParam=0.3,
    )
    fitted = estimator.fit(value)

    # Report the learned parameters on stdout.
    print("Coefficients: " + str(fitted.coefficients))
    print("Intercept: " + str(fitted.intercept))

    return (fitted.coefficients, 1)
def model_dev_glm(df_train, df_test, max_iter, fit_intercept, reg_param):
    """Train a Gaussian/identity GLM and score it on a test sample.

    Args:
        df_train: Training DataFrame with ``features`` and ``label`` columns.
        df_test: Test DataFrame scored with the fitted model.
        max_iter: ``maxIter`` for the estimator.
        fit_intercept: ``fitIntercept`` for the estimator.
        reg_param: ``regParam`` for the estimator.

    Returns:
        Tuple ``(fitted_model, stats)`` where ``stats`` is a one-row pandas
        DataFrame holding R2, MSE, RMSE, MAE (rounded to 3 decimals) and the
        elapsed time in minutes.
    """
    started_at = time()

    estimator = GeneralizedLinearRegression(labelCol='label',
                                            featuresCol='features',
                                            family="gaussian",
                                            link="identity",
                                            fitIntercept=fit_intercept,
                                            maxIter=max_iter,
                                            regParam=reg_param)
    fitted = estimator.fit(df_train)

    # Keep only the columns the evaluator needs.
    scored = fitted.transform(df_test).select(['prediction', 'label'])
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")

    def rounded_metric(metric_name):
        # Evaluate one metric on the scored test sample, rounded to 3 places.
        value = evaluator.evaluate(scored, {evaluator.metricName: metric_name})
        return round(value, 3)

    r2 = rounded_metric("r2")
    mse = rounded_metric("mse")
    rmse = rounded_metric("rmse")
    mae = rounded_metric("mae")

    print("\n++++++ Printing Generalized Linear Model Accuracy ++++++\n")
    print("R Square: " + str(r2 * 100) + "%")
    print("Mean Squared Error: " + str(mse))
    print("Root Mean Squared Error: " + str(rmse))
    print("Mean Absolute Error: " + str(mae))

    elapsed_minutes = (time() - started_at) / 60
    stats = pd.DataFrame({
        "Model Name": ["Generalized Linear Model"],
        "R Square": r2,
        "Mean Squared Error": mse,
        "Root Mean Squared Error": rmse,
        "Mean Absolute Error": mae,
        "Time (Min.)": round(elapsed_minutes, 3)
    })
    return (fitted, stats)
def generalized_linear_regression(trainingDataFrame,
                                  family="gaussian",
                                  link="identity",
                                  maxIter=10,
                                  regParam=0.3):
    """Fit a GLM on ``trainingDataFrame`` and bundle the main results.

    Args:
        trainingDataFrame: DataFrame passed to ``fit``.
        family: Error distribution family for the estimator.
        link: Link function for the estimator.
        maxIter: Maximum number of iterations.
        regParam: Regularisation parameter.

    Returns:
        dict with keys ``"model"``, ``"summary"``, ``"intercept"`` and
        ``"coefficients"`` taken from the fitted model.
    """
    estimator = GeneralizedLinearRegression(family=family,
                                            link=link,
                                            maxIter=maxIter,
                                            regParam=regParam)
    fitted = estimator.fit(trainingDataFrame)
    return {
        "model": fitted,
        "summary": fitted.summary,
        "intercept": fitted.intercept,
        "coefficients": fitted.coefficients,
    }
def main(self, sc, *args):
    """Fit a Poisson GLM on the upstream points and save it to the output path.

    Args:
        sc: Spark context handed to the upstream task's ``get_points_rdd``.
        *args: Unused.
    """
    points_rdd = self.requires().get_points_rdd(sc)
    estimator = GeneralizedLinearRegression(family='poisson',
                                            link=self.link,
                                            maxIter=self.iterations)
    session = SparkSession.builder.getOrCreate()
    training_frame = session.createDataFrame(points_rdd)
    fitted = estimator.fit(training_frame)
    fitted.save(self.output().path)
def linear_regression(ticker, writer):
    """Check whether a GLM predicts the direction of today's close correctly.

    Fits a Gaussian/identity GLM on ``../data/newlr/<ticker>_no_today.csv``
    (libsvm format), predicts today's close from today's volume, and compares
    the direction of the prediction with the direction of the actual close,
    both relative to yesterday's close.

    Args:
        ticker: Ticker symbol used to build the data file paths.
        writer: Unused; kept for interface compatibility.

    Returns:
        True when the predicted and actual close are on the same side of
        yesterday's close (ties count for both sides), otherwise False.

    Raises:
        ValueError: If the tickers CSV has fewer than two rows, so today's
            volume cannot be read.
    """
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()
    try:
        # Load training data
        dataset1 = spark.read.format("libsvm")\
            .load("../data/newlr/" + ticker + "_no_today.csv")
        glr1 = GeneralizedLinearRegression(family="gaussian", link="identity",
                                           maxIter=10, regParam=0.3)
        model1 = glr1.fit(dataset1)

        # Today's volume is column 5 of the second row of the tickers file.
        today_volume = None
        with open("../data/tickers/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for index, row in enumerate(reader):
                if index == 1:
                    today_volume = float(row[5])
                    break
        if today_volume is None:
            raise ValueError(
                "no volume row found in ../data/tickers/" + ticker + ".csv")

        predict_close_value = (-1 * float(model1.coefficients[0])
                               + today_volume * float(model1.coefficients[1])
                               + float(model1.intercept))
        print(predict_close_value)

        # First row holds today's close, second row yesterday's close.
        # Convert to float so the comparisons below are numeric rather than
        # float-vs-str (a TypeError on Python 3).
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/newlr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                elif index == 1:
                    yesterday_close_value = float(row[0])
                    break
    finally:
        # Release the session even when loading or parsing fails.
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
def test_offset(self):
    """Fit a weighted Poisson GLR with an offset column and check its parameters."""
    rows = [
        (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
        (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
        (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
        (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "offset", "features"])

    estimator = GeneralizedLinearRegression(family="poisson",
                                            weightCol="weight",
                                            offsetCol="offset")
    fitted = estimator.fit(df)

    # Expected values are compared with an absolute tolerance of 1e-4.
    coefficients_match = np.allclose(fitted.coefficients.toArray(),
                                     [0.664647, -0.3192581], atol=1E-4)
    self.assertTrue(coefficients_match)
    intercept_matches = np.isclose(fitted.intercept, -1.561613, atol=1E-4)
    self.assertTrue(intercept_matches)
def regression(train_set, test_set, featuresColumn, labelColumn):
    """Fit a Gaussian GLM with log link and return predictions for the test set.

    Args:
        train_set: DataFrame used for fitting.
        test_set: DataFrame evaluated with the fitted model.
        featuresColumn: Name of the features column.
        labelColumn: Name of the label column.

    Returns:
        The ``predictions`` DataFrame of the summary produced by evaluating
        the fitted model on ``test_set``.
    """
    estimator = GeneralizedLinearRegression(
        featuresCol=featuresColumn,
        labelCol=labelColumn,
        family="gaussian",
        link="log",
        maxIter=10,
        regParam=0.3,
    )
    fitted = estimator.fit(train_set)
    evaluation_summary = fitted.evaluate(test_set)
    return evaluation_summary.predictions
def best_subset_selection_GLM(df, labelCol, Cols, label_is_categorical=False,
                              family='gaussian', link='identity'):
    """Fit a GLM on every non-empty subset of ``Cols`` and report the AIC.

    Args:
        df: Input DataFrame.
        labelCol: Name of the label column in ``df``.
        Cols: Candidate feature column names.
        label_is_categorical: Forwarded to ``prepare_data``.
        family: GLM family.
        link: GLM link function.

    Returns:
        pandas DataFrame with columns ``num_features``, ``AIC`` and
        ``features``, indexed by ``Model ID`` (fit order) and sorted by AIC
        in descending order.
    """
    # The empty subset is never fitted, hence 2**n - 1 models.
    print('Total number of iterations: {}'.format(2**len(Cols) - 1))
    AIC_values, feature_list, num_features = [], [], []
    for k in range(1, len(Cols) + 1):
        for combo in itertools.combinations(Cols, k):
            continuousCols, categoricalCols = [], []
            for col in combo:
                # typeName() gives 'string' for StringType and does not depend
                # on how the type object is rendered by str().
                if df.schema[col].dataType.typeName() == 'string':
                    categoricalCols.append(col)
                else:
                    continuousCols.append(col)
            data = prepare_data(df=df,
                                labelCol=labelCol,
                                label_is_categorical=label_is_categorical,
                                categoricalCols=categoricalCols,
                                continuousCols=continuousCols)
            model = GeneralizedLinearRegression(family=family,
                                                link=link,
                                                featuresCol='features',
                                                labelCol='label')
            AIC = model.fit(data).summary.aic
            AIC_values.append(AIC)
            feature_list.append(combo)
            num_features.append(len(combo))
            print('Feature/s: {}, AIC={:.3f}'.format(combo, AIC))
    return pd.DataFrame({
        'num_features': num_features,
        'AIC': AIC_values,
        'features': feature_list
    }).rename_axis('Model ID').sort_values('AIC', ascending=False)
def create_model(training_data, features_col, label_col):
    """Cross-validate a generalized linear regression over two families.

    :param training_data: -- dataframe: training dataset
    :param features_col: -- col: containing all the features needed.
    :param label_col: -- col: label
    :return: tuple of the fitted cross-validation model and the evaluator
        that was used to score it
    """
    estimator = GeneralizedLinearRegression()

    # Fixed column settings plus a grid over the family parameter.
    grid_builder = ParamGridBuilder()
    grid_builder.baseOn({estimator.labelCol: label_col})
    grid_builder.baseOn({estimator.featuresCol: features_col})
    grid_builder.addGrid(estimator.family, ["gaussian", "poisson"])
    param_maps = grid_builder.build()

    glr_evaluator = RegressionEvaluator(labelCol=label_col)

    cross_validator = CrossValidator(estimator=estimator,
                                     estimatorParamMaps=param_maps,
                                     evaluator=glr_evaluator)
    cv_model = cross_validator.fit(training_data)
    return cv_model, glr_evaluator
def test_tweedie_distribution(self):
    """Fit tweedie GLRs (default and linkPower=-1.0) and check their parameters."""
    rows = [
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])

    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)

    # First fit: link power left at the estimator's default.
    first = glr.fit(df)
    first_coefficients_ok = np.allclose(first.coefficients.toArray(),
                                        [-0.4645, 0.3402], atol=1E-4)
    self.assertTrue(first_coefficients_ok)
    self.assertTrue(np.isclose(first.intercept, 0.7841, atol=1E-4))

    # Second fit: same estimator with the link power set to -1.0.
    glr.setLinkPower(-1.0)
    second = glr.fit(df)
    second_coefficients_ok = np.allclose(second.coefficients.toArray(),
                                         [-0.6667, 0.5], atol=1E-4)
    self.assertTrue(second_coefficients_ok)
    self.assertTrue(np.isclose(second.intercept, 0.6667, atol=1E-4))
def test_glr_load(self):
    """Check that a saved GLR model reloads with the same solver and output.

    The model is saved under a fresh temporary directory, which is removed
    again when the test finishes, whether it passes or fails.
    """
    import shutil

    df = self.spark.createDataFrame(
        [(1.0, Vectors.dense(0.0, 0.0)),
         (1.0, Vectors.dense(1.0, 2.0)),
         (2.0, Vectors.dense(0.0, 0.0)),
         (2.0, Vectors.dense(1.0, 1.0))],
        ["label", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      linkPredictionCol="p")
    model = glr.fit(df)
    self.assertEqual(model.getSolver(), "irls")
    transformed1 = model.transform(df)

    path = tempfile.mkdtemp()
    try:
        model_path = path + "/glr"
        model.save(model_path)
        model2 = GeneralizedLinearRegressionModel.load(model_path)
        self.assertEqual(model2.getSolver(), "irls")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))
    finally:
        # mkdtemp never cleans up after itself; do not leak the directory.
        shutil.rmtree(path, ignore_errors=True)
def test_glr_summary(self):
    """Verify the values and types exposed by the GLR training summary."""
    from pyspark.ml.linalg import Vectors

    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
    estimator = GeneralizedLinearRegression(family="gaussian",
                                            link="identity",
                                            weightCol="weight",
                                            fitIntercept=False)
    model = estimator.fit(df)
    self.assertTrue(model.hasSummary)
    summary = model.summary

    # test that api is callable and returns expected types
    self.assertEqual(summary.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertIsInstance(summary.predictions, DataFrame)
    self.assertEqual(summary.predictionCol, "prediction")
    self.assertEqual(summary.numInstances, 2)
    self.assertIsInstance(summary.residuals(), DataFrame)
    self.assertIsInstance(summary.residuals("pearson"), DataFrame)

    # Per-coefficient statistics are lists of floats.
    for stat_name in ("coefficientStandardErrors", "tValues", "pValues"):
        stat_values = getattr(summary, stat_name)
        self.assertIsInstance(stat_values, list)
        self.assertIsInstance(stat_values[0], float)

    self.assertEqual(summary.degreesOfFreedom, 1)
    self.assertEqual(summary.residualDegreeOfFreedom, 1)
    self.assertEqual(summary.residualDegreeOfFreedomNull, 2)
    self.assertEqual(summary.rank, 1)
    self.assertIsInstance(summary.solver, str)

    # Scalar goodness-of-fit statistics are floats.
    for scalar_name in ("aic", "deviance", "nullDeviance", "dispersion"):
        self.assertIsInstance(getattr(summary, scalar_name), float)

    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class GeneralizedLinearRegressionTrainingSummary runs full test
    same_summary = model.evaluate(df)
    self.assertAlmostEqual(same_summary.deviance, summary.deviance)
def train_fit_glmm(window, date_label: str):
    """Fit a Poisson/log GLM on the ``train`` column and predict from ``retrain``.

    Args:
        window: DataFrame containing the columns selected below.
        date_label: SQL expression that is aliased as the ``label`` column.

    Returns:
        DataFrame with columns ``page_id``, ``train``, ``validate``,
        ``retrain``, ``test`` and ``prediction``.
    """
    estimator = GeneralizedLinearRegression(family="poisson",
                                            link="log",
                                            maxIter=10,
                                            regParam=0.3)

    training_frame = window.select(
        denoise("train").alias("features"),
        F.expr(f"{date_label} as label"),
    )
    fitted = estimator.fit(training_frame)
    # TODO: may want to persist the fitted model

    # Score using features derived from the `retrain` column.
    scoring_frame = window.withColumn("features", denoise("retrain"))
    observations = fitted.transform(scoring_frame)
    return observations.select(
        "page_id", "train", "validate", "retrain", "test", "prediction")
def GL(df_data):
    """Train a Gaussian/identity GLM on ``df_data`` and print the elapsed time.

    Args:
        df_data: DataFrame with ``features`` and ``label`` columns.

    Returns:
        The fitted model.
    """
    print("Train a GeneralizedLinearRegression model...")
    started_at = time.time()

    estimator = GeneralizedLinearRegression(family="gaussian",
                                            link="identity",
                                            maxIter=10,
                                            regParam=0.3)
    estimator.setFeaturesCol("features")
    estimator.setLabelCol("label")
    gl_model = estimator.fit(df_data)

    elapsed = time.time() - started_at
    print("gl_model using time: %.2fs\n" % elapsed)
    return gl_model
def algorithm(target):
    """Build the five unfitted regressors that predict ``target``.

    Args:
        target: Name of the label column.

    Returns:
        Tuple ``(rf, gbt, dt, lr, glr, model)`` where ``model`` is the list
        ``[gbt, dt, lr, glr, rf]``.
    """
    # Column settings shared by every estimator.
    columns = {'featuresCol': 'Features', 'labelCol': target}

    rf = RandomForestRegressor(**columns)
    gbt = GBTRegressor(**columns)
    dt = DecisionTreeRegressor(**columns)
    lr = LinearRegression(**columns)
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", **columns)

    model = [gbt, dt, lr, glr, rf]
    return rf, gbt, dt, lr, glr, model
def main(argv):
    """Fit three regressors on a parquet dataset and print their metrics.

    Args:
        argv: ``argv[0]`` is the path of the parquet dataset, ``argv[1]`` the
            name of the label column.

    Each estimator (linear regression, gradient-boosted trees, generalized
    linear regression) is wrapped in ``sparkRegressor`` and the metrics it
    returns are printed as tab-separated ``name<TAB>value`` lines.
    """
    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetRegressor") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()
    print(vector)
    featureCount = len(vector)
    # These messages need the f prefix; without it the braces are printed
    # literally instead of the values.
    print(f"Feature count            : {featureCount}")
    print(f"Dataset size (unbalanced) : {data.count()}")

    testFraction = 0.3
    seed = 123

    estimators = [
        # Linear Regression
        LinearRegression().setLabelCol(label)
                          .setFeaturesCol("features"),
        # GBTRegressor
        GBTRegressor().setLabelCol(label)
                      .setFeaturesCol("features"),
        # GeneralizedLinearRegression
        GeneralizedLinearRegression().setLabelCol(label)
                                     .setFeaturesCol("features")
                                     .setFamily("gaussian")
                                     .setLink("identity")
                                     .setMaxIter(10)
                                     .setRegParam(0.3),
    ]
    for estimator in estimators:
        reg = sparkRegressor(estimator, label, testFraction, seed)
        metrics = reg.fit(data)
        for k, v in metrics.items():
            print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." % (end - start))
def _model(self):
    """Build the unfitted estimator that matches ``self.family``.

    Returns:
        A ``LinearRegression`` for the Gaussian family or a binomial/logit
        ``GeneralizedLinearRegression``, with label column and maximum
        iterations set from the instance.

    Raises:
        NotImplementedError: For any other family.
    """
    family = self.family
    if family == GAUSSIAN_:
        estimator = LinearRegression()
    elif family == BINOMIAL_:
        estimator = GeneralizedLinearRegression(family="binomial", link="logit")
    else:
        raise NotImplementedError(
            "Family '{}' not implemented".format(family))

    # Settings shared by both estimator types.
    estimator.setLabelCol(self.response)
    estimator.setMaxIter(self.__max_iter)
    return estimator
def get_glm_pipeline_stages(categorical_columns,
                            continuous_columns,
                            label_column,
                            family="tweedie",
                            link="identity",
                            variance_power=0.0,
                            link_power=1.0) -> List:
    """Build the pipeline stages for a GLM on mixed categorical/continuous data.

    Each categorical column is string-indexed (``<col>_IDX``) and one-hot
    encoded (``<col>_OHE``, keeping the last category). The encoded columns
    and the continuous columns are assembled into ``features_<label>``, and
    the GLM writes its output to ``prediction_<label>``.

    For the ``tweedie`` family the model is configured with ``link_power`` and
    ``variance_power``; for every other family it is configured with ``link``.

    Returns:
        List of stages: the encoders, then the assembler, then the GLM.
    """
    encoders = []
    encoded_columns = []
    for column in categorical_columns:
        indexed_name = f"{column}_IDX"
        encoded_name = f"{column}_OHE"
        encoders.append(StringIndexer(inputCol=column, outputCol=indexed_name))
        encoders.append(OneHotEncoder(inputCol=indexed_name,
                                      outputCol=encoded_name,
                                      dropLast=False))
        encoded_columns.append(encoded_name)

    features_column = f"features_{label_column}"
    prediction_column = f"prediction_{label_column}"

    assembler = VectorAssembler(inputCols=encoded_columns + continuous_columns,
                                outputCol=features_column)

    # Arguments common to both configurations of the estimator.
    glm_kwargs = {
        "labelCol": label_column,
        "featuresCol": features_column,
        "predictionCol": prediction_column,
        "family": family,
    }
    if family == 'tweedie':
        glm_kwargs["linkPower"] = link_power
        glm_kwargs["variancePower"] = variance_power
    else:
        glm_kwargs["link"] = link
    _model = GeneralizedLinearRegression(**glm_kwargs)

    stages: List = encoders + [assembler, _model]
    return stages
def GL_for(df_data):
    """Train one GLM per family on ``df_data`` and print the total time.

    Args:
        df_data: DataFrame with ``features`` and ``label`` columns.

    Returns:
        The model fitted for the last family in the list (poisson).
    """
    print("Train a GeneralizedLinearRegression model...")
    t1 = time.time()
    # The binomial family does not accept the identity link, so it is paired
    # with logit; the other two families keep the identity link.
    family_links = [
        ('gaussian', 'identity'),
        ('binomial', 'logit'),
        ('poisson', 'identity'),
    ]
    gl_model = None
    for family_name, link_name in family_links:
        gl_model = GeneralizedLinearRegression(family=family_name, link=link_name,
                                               maxIter=10, regParam=0.3) \
            .setFeaturesCol("features") \
            .setLabelCol("label") \
            .fit(df_data)
    t2 = time.time() - t1
    print("gl_model using time: %.2fs\n" % t2)
    return gl_model
def linear_regression(ticker, writer):
    """Fit a GLM for ``ticker`` and check the direction of its prediction.

    Fits a Gaussian/identity GLM on ``../data/lr/<ticker>_no_today.csv``
    (libsvm format), writes the first coefficient and the intercept through
    ``writer``, and compares the direction of the predicted close with the
    direction of the actual close, both relative to yesterday's close.

    Args:
        ticker: Ticker symbol used to build the data file paths.
        writer: Object with a ``writerow`` method that receives the model row.

    Returns:
        True when the predicted and actual close are on the same side of
        yesterday's close (ties count for both sides), otherwise False.
    """
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()
    try:
        # Load training data
        dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
        glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                          maxIter=1, regParam=0.8)
        model = glr.fit(dataset)

        data = [ticker, 'coefficient:', model.coefficients[0], 'intercept:', model.intercept]
        writer.writerow(data)
        print(data)

        # First row holds today's close, second row yesterday's close.
        # Convert to float so the comparisons below are numeric rather than
        # float-vs-str (a TypeError on Python 3).
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/lr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                elif index == 1:
                    yesterday_close_value = float(row[0])
                    break

        predict_close_value = -1 * float(model.coefficients[0]) + float(model.intercept)
    finally:
        # Release the session even when loading or parsing fails.
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
def store_multiple_trained_models():
    """Train and evaluate one Gaussian GLM per item/store key.

    Reads the ``s5_01`` dataset, keeps rows with a non-null label, derives a
    ``key`` column as ``"<item_id>_<store_id>"`` and, for up to 200 distinct
    keys in key order, fits a model on a 90% split and prints the RMSE on the
    remaining 10%.
    """
    print("-- store_multiple_trained_models")

    spark = SparkSession.builder \
        .appName("tryout") \
        .getOrCreate()

    # Create small df if not exists
    # hlp.create_small_dataframe(spark)

    # Read data and filter for training data
    pp: DataFrame = hlp.readFromDatadirParquet(spark, "s5_01") \
        .where(F.col("label").isNotNull())

    # Create key column
    key_udf = F.udf(lambda a, b: f"{a}_{b}", T.StringType())
    pp1 = pp.withColumn('key', key_udf(pp.item_id, pp.store_id))
    pp1.describe().show()

    # data ordered by key
    pp2 = pp1.sort('key')

    def train_simple(data: DataFrame, esti: Estimator, key: str):
        """Fit ``esti`` on 90% of ``data`` and print the RMSE on the rest."""
        print(f"--- train_simple {key}")

        # Prepare training and test data.
        df_train, df_test = data.randomSplit([0.9, 0.1], seed=12345)

        trained_model: Transformer = esti.fit(df_train)

        # Make predictions on test data.
        predictions = trained_model.transform(df_test) \
            .select("features", "label", "prediction")

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print(f"-- (RMSE) for {key} {rmse}")

    keys = chain(*pp1.select("key").distinct().orderBy('key').take(200))
    for k in keys:
        # Compare against the value as a column expression; interpolating the
        # key into a SQL string breaks on keys that contain a quote.
        pp3 = pp2.filter(F.col("key") == k)
        esti = GeneralizedLinearRegression(family='gaussian', link='identity')
        train_simple(pp3, esti, k)
def create_models(ml_models, train_df, test_df, train_FM_df, test_FM_df):
    """Cross-validate and evaluate each requested regression model.

    Args:
        ml_models: Iterable of model codes: ``"LR"``, ``"GLR"``, ``"RF"``,
            ``"GBT"`` or ``"FM"``. Unknown codes are reported and skipped.
        train_df: Training DataFrame for every model except ``"FM"``.
        test_df: Test DataFrame for every model except ``"FM"``.
        train_FM_df: Training DataFrame for the ``"FM"`` model.
        test_FM_df: Test DataFrame for the ``"FM"`` model.

    Each model is tuned with 4-fold cross validation on RMSE and the result is
    handed to ``model_evaluation``.
    """
    # Declare evaluator for crossvalidation
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                    metricName="rmse")

    for mo in ml_models:
        if mo == "LR":
            print("---- Linear Regression ----")
            model = LinearRegression(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder() \
                .addGrid(model.regParam, [0.1, 0.01]) \
                .addGrid(model.elasticNetParam, [0.1, 0.6]) \
                .build()
        elif mo == "GLR":
            print("---- Generalized linear Regression ----")
            model = GeneralizedLinearRegression(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder() \
                .addGrid(model.regParam, [0.1, 0.01]) \
                .build()
        elif mo == "RF":
            print("---- Random Forest ----")
            model = RandomForestRegressor(featuresCol="features", labelCol="label",
                                          maxMemoryInMB=5000)
            paramGrid = ParamGridBuilder() \
                .addGrid(model.maxDepth, [5, 10]) \
                .addGrid(model.numTrees, [10, 20]) \
                .build()
        elif mo == "GBT":
            print("---- Gradient Boost Tree ----")
            model = GBTRegressor(featuresCol="features", labelCol="label",
                                 maxMemoryInMB=5000)
            paramGrid = ParamGridBuilder() \
                .addGrid(model.maxDepth, [5, 10]) \
                .build()
        elif mo == "FM":
            print("---- Factorization Machines Regression ----")
            model = FMRegressor(featuresCol="features", labelCol="label")
            paramGrid = ParamGridBuilder() \
                .addGrid(model.regParam, [0.5, 0.3, 0.1, 0.01]) \
                .build()
        else:
            print(f"{mo} no detected as a ml model")
            # Skip it: otherwise the estimator left over from the previous
            # iteration would be cross-validated again (or, on the first
            # iteration, `model` would be undefined).
            continue

        # The FM model has its own train/test frames.
        if mo == "FM":
            fit_df, eval_df = train_FM_df, test_FM_df
        else:
            fit_df, eval_df = train_df, test_df

        # Cross validation
        cval = CrossValidator(estimator=model, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=4)
        cvModel = cval.fit(fit_df)
        model_evaluation(cvModel, eval_df, "features", "label")
def selectRegressionMethod(regressionMethodName, featureName):
    """Build the unfitted regressor for a method code.

    Args:
        regressionMethodName: ``"rf"`` (random forest), ``"gbt"``
            (gradient-boosted trees) or ``"glr"`` (Poisson/log GLM).
        featureName: Name of the features column for the estimator.

    Returns:
        Tuple ``(regressionMethod, modelParameters)`` with the estimator and
        the dict of parameters it was built from, or an empty tuple when the
        method code is not recognised.
    """
    if regressionMethodName == "rf":
        # `test` is a module-level flag: a single tree when it equals True.
        nt = 1 if test == True else 100  # noqa: E712 - keep the loose comparison
        modelParameters = {
            'featuresCol': featureName,
            'numTrees': nt,
            'subsamplingRate': 1,
            'maxDepth': 10
        }
        regressionMethod = RandomForestRegressor(
            featuresCol=modelParameters['featuresCol'],
            numTrees=modelParameters['numTrees'],
            subsamplingRate=modelParameters['subsamplingRate'],
            maxDepth=modelParameters['maxDepth'])
    elif regressionMethodName == "gbt":
        modelParameters = {'featuresCol': featureName, 'maxIter': 10}
        regressionMethod = GBTRegressor(
            featuresCol=modelParameters['featuresCol'],
            maxIter=modelParameters['maxIter'])
    elif regressionMethodName == "glr":
        modelParameters = {
            'featuresCol': featureName,
            'family': "poisson",
            'link': 'log',
            'maxIter': 10,
            'regParam': 0.3
        }
        # Pass featuresCol like the other branches do; without it the
        # estimator ignores featureName and reads its default column.
        regressionMethod = GeneralizedLinearRegression(
            featuresCol=modelParameters['featuresCol'],
            family=modelParameters['family'],
            link=modelParameters['link'],
            maxIter=modelParameters['maxIter'],
            regParam=modelParameters['regParam'])
    else:
        print('Invalid regression method')
        return ()
    return (regressionMethod, modelParameters)
def bestGeneralizedLR(trainDf, metricDF, metricToCompare):
    """Fit a grid of GLMs on ``trainDf`` and return the best one.

    The grid covers three regularisation values, the poisson and gaussian
    families with the links listed for each, and four tolerances. All fitted
    models are handed to ``getBestModel`` together with ``metricDF`` and
    ``metricToCompare``.

    Returns:
        Whatever ``getBestModel`` returns for the fitted models.
    """
    reg_params = [1.0, 0.6, 0.2]
    tolerances = [1.0, 0.6, 0.2, 0.0]
    families = ["poisson", "gaussian"]
    links_by_family = {
        "poisson": ["identity", "sqrt", "log"],
        "gaussian": ["identity"],
    }

    # Same nesting order as a loop over regParam, family, link, tol.
    fitted_models = [
        GeneralizedLinearRegression(maxIter=10,
                                    regParam=reg,
                                    family=fam,
                                    link=link_name,
                                    tol=tolerance).fit(trainDf)
        for reg in reg_params
        for fam in families
        for link_name in links_by_family.get(fam)
        for tolerance in tolerances
    ]
    return getBestModel(fitted_models, metricDF, metricToCompare)
def generalized_linear_regression():
    """Fit a small Gaussian GLM and round-trip the estimator and model to disk.

    Fits on a four-row in-memory frame, then saves and reloads both the
    estimator (``.//glr``) and the fitted model (``.//glr_model``),
    overwriting anything already stored at those paths.

    Returns:
        dict mapping a check name to a bool telling whether the fitted or
        reloaded value matched the expected one.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
    )
    model = glr.fit(df)
    transformed = model.transform(df)

    # No linkPredictionCol is configured, so the output has no "p" column;
    # reading `transformed.head().p` would raise AttributeError.
    checks = {
        "prediction": bool(abs(transformed.head().prediction - 1.5) < 0.001),
        "numFeatures": model.numFeatures == 2,
        "intercept": bool(abs(model.intercept - 1.5) < 0.001),
    }

    temp_path = "./"
    glr_path = temp_path + "/glr"
    # Overwrite so a second run does not fail on the existing path.
    glr.write().overwrite().save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    checks["estimator_family"] = glr.getFamily() == glr2.getFamily()

    model_path = temp_path + "/glr_model"
    model.write().overwrite().save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    checks["model_intercept"] = bool(model.intercept == model2.intercept)
    checks["model_first_coefficient"] = bool(
        model.coefficients[0] == model2.coefficients[0])
    return checks
# COMMAND ---------- (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100) print(trainingData.count()) print(testData.count()) # COMMAND ---------- from pyspark.ml.regression import GeneralizedLinearRegression # Load training data dataset = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3) # Fit the model model = glr.fit(dataset) # Print the coefficients and intercept for generalized linear regression model print("Coefficients: " + str(model.coefficients)) print("Intercept: " + str(model.intercept)) # Summarize the model over the training set and print out some metrics summary = model.summary print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors)) print("T Values: " + str(summary.tValues)) print("P Values: " + str(summary.pValues)) print("Dispersion: " + str(summary.dispersion)) print("Null Deviance: " + str(summary.nullDeviance))
], outputCol='features')
# Add the assembled `features` vector column and preview ten rows.
v_data = vectorAssembler.transform(data)
v_data.show(10)

# Split into training and test sets
# Keep only the feature vector and the `medv` label column.
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
# 70/30 random split; no seed is passed, so the split differs between runs.
splits = vdata.randomSplit([0.7, 0.3])
train_data = splits[0]
test_data = splits[1]

# Train
# Gaussian family with identity link, up to 1000 iterations, regParam 0.3.
glr = GeneralizedLinearRegression(family="gaussian", link="identity", labelCol='medv', featuresCol='features', maxIter=1000, regParam=0.3)

# Fit the model
GlModel = glr.fit(train_data)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(GlModel.coefficients))
print("Intercept: " + str(GlModel.intercept))

# Summarize the model over the training set and print out some metrics
summary = GlModel.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
# Assemble the feature vector for the test frame and keep only id + features.
assembler = VectorAssembler(inputCols=featureNames, outputCol="features")
test_df = assembler.transform(test_df)
test_df = test_df.select("id", "features")
print("test vector assembled")
test_df.show(5)

# Split `train_df` into train and test sets (30% held out for testing)

#Split train and test
seed(0)
# seed(0) above does not reach Spark's sampler, so the seed is also passed to
# randomSplit to make the split reproducible.
(trainingData, testData) = train_df.randomSplit([0.7, 0.3], seed=0)

# ## Logistic Regression

#Fit logistic regression
glr = GeneralizedLinearRegression(family="binomial", link="logit", featuresCol="features", labelCol="is_duplicate")
trainLogitModel = glr.fit(trainingData)

#Logistic model predictions
LogitPredictions = trainLogitModel.transform(testData)

# Calculate AUC
# The model's `prediction` column is used as the raw score for the ROC curve.
evaluator = BinaryClassificationEvaluator(labelCol="is_duplicate", rawPredictionCol="prediction", metricName="areaUnderROC")
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees

#Fit decision tree model
# $example on$ from pyspark.ml.regression import GeneralizedLinearRegression # $example off$ if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("GeneralizedLinearRegressionExample")\ .getOrCreate() # $example on$ # Load training data dataset = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3) # Fit the model model = glr.fit(dataset) # Print the coefficients and intercept for generalized linear regression model print("Coefficients: " + str(model.coefficients)) print("Intercept: " + str(model.intercept)) # Summarize the model over the training set and print out some metrics summary = model.summary print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors)) print("T Values: " + str(summary.tValues)) print("P Values: " + str(summary.pValues)) print("Dispersion: " + str(summary.dispersion)) print("Null Deviance: " + str(summary.nullDeviance))
# Random Spliting training, testing = modelprep2.randomSplit([0.8, 0.2]) #modelprep2.count() #training.count() #testing.count() ####################################################################################### # # Modeling - GLM (Regression) # ####################################################################################### glm = GeneralizedLinearRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.3) glmmodel = glm.fit(training) summary = glmmodel.summary # Show Coefficients and Intercept print("\nFeatures: " + str(features) + "\n") print("\nCoefficients: " + str(glmmodel.coefficients) + "\n") print("\nIntercept: " + str(glmmodel.intercept) + "\n") print("\nPValues: " + str(summary.pValues) + "\n") # Summarize the model over the training set and print out some metrics #print("\nCoefficient Standard Errors: " + str(summary.coefficientStandardErrors)) #print("T Values: " + str(summary.tValues)) #print("P Values: " + str(summary.pValues)) #print("Dispersion: " + str(summary.dispersion))
# COMMAND ----------

# Inspect the training summary of the fitted linear regression model.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
summary = lrModel.summary
summary.residuals.show()
print(summary.totalIterations)
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Gaussian/identity GLM that also writes the link prediction to "linkOut".
glr = GeneralizedLinearRegression()\
    .setFamily("gaussian")\
    .setLink("identity")\
    .setMaxIter(10)\
    .setRegParam(0.3)\
    .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor with the estimator's default parameters.
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)

# COMMAND ----------