def test_glr_summary(self):
    """Check that the GLR training summary is callable and returns the expected types.

    Fits a weighted gaussian/identity model without intercept on a two-row
    DataFrame, inspects the training summary, and finally verifies that
    ``model.evaluate`` on the training data reports the same deviance.
    """
    # The features column is consumed by the DataFrame-based estimator, so the
    # vectors come from pyspark.ml.linalg rather than pyspark.mllib.linalg.
    from pyspark.ml.linalg import Vectors

    df = self.spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                      fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary

    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertIsInstance(s.predictions, DataFrame)
    self.assertEqual(s.predictionCol, "prediction")
    self.assertIsInstance(s.residuals(), DataFrame)
    self.assertIsInstance(s.residuals("pearson"), DataFrame)

    coefStdErr = s.coefficientStandardErrors
    self.assertIsInstance(coefStdErr, list)
    self.assertIsInstance(coefStdErr[0], float)
    tValues = s.tValues
    self.assertIsInstance(tValues, list)
    self.assertIsInstance(tValues[0], float)
    pValues = s.pValues
    self.assertIsInstance(pValues, list)
    self.assertIsInstance(pValues[0], float)

    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    # NOTE(review): `basestring` must be supplied by the enclosing module
    # (it is not a builtin on Python 3) - left unchanged here.
    self.assertIsInstance(s.solver, basestring)
    self.assertIsInstance(s.aic, float)
    self.assertIsInstance(s.deviance, float)
    self.assertIsInstance(s.nullDeviance, float)
    self.assertIsInstance(s.dispersion, float)

    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def model():
    """Fit a gaussian GLM on summed cost and stop in the debugger.

    Reads the parquet file at ``DATA_PARQUET``, sums ``cost`` per
    (hash_number_A, interest_1, phone_price_category) group, string-indexes
    the two categorical columns into a feature vector and fits the
    regression. Nothing is returned; the function ends in ``breakpoint()``
    so the locals can be inspected interactively.
    """
    data = sql.read.parquet(str(DATA_PARQUET))
    data.createOrReplaceTempView('data')

    query = ''' select hash_number_A ,interest_1 ,phone_price_category ,sum(cost) as label from data group by hash_number_A, interest_1, phone_price_category'''
    sample = sql.sql(query)

    # Encode both string columns, then pack them into one feature vector.
    interest_indexer = StringIndexer(inputCol='interest_1', outputCol='interest')
    price_indexer = StringIndexer(inputCol='phone_price_category',
                                  outputCol='phone_price')
    assembler = VectorAssembler(inputCols=['interest', 'phone_price'],
                                outputCol='features')
    pipeline = Pipeline(stages=[interest_indexer, price_indexer, assembler])

    model_data = pipeline.fit(sample)
    sample = model_data.transform(sample)

    # Other families: 'gaussian', 'binomial', 'poisson', 'gamma', 'tweedie'
    regression = GeneralizedLinearRegression(
        family='gaussian',
        labelCol='label',
        featuresCol='features',
        maxIter=10,
        regParam=0.3,
    )
    # Local names are kept stable because they are what the debugger exposes.
    model = regression.fit(sample)
    breakpoint()
def generalizeRegression(df, label, features, adjust):
    """Fit a poisson generalized linear regression and report its RMSE.

    Args:
        df: DataFrame holding the label, feature and ``adjust`` columns.
        label: Name of the label column.
        features: Names of the numeric feature columns. The list is not
            modified; the indexed ``adjust`` columns are added to a copy.
        adjust: Names of non-numerical columns. Each is string-indexed into a
            ``<name>_num`` column that is used as an additional feature.

    Returns:
        dict with ``"RMSE"`` (evaluated on the same DataFrame the model was
        fitted on) and ``"predictions"`` (list of predicted values).
    """
    # Work on a copy so the caller's list does not grow on every call.
    feature_cols = list(features)

    # Columns with non numerical values are adjusted
    for column in adjust:
        indexed_name = "{}_num".format(column)
        indexer = StringIndexer(inputCol=column, outputCol=indexed_name)
        feature_cols.append(indexed_name)
        df = indexer.fit(df).transform(df)

    # Features vector configured from dataframe for model processing
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    assembled = assembler.transform(df)

    gr = GeneralizedLinearRegression(featuresCol='features', labelCol=label,
                                     regParam=0.3, family="poisson")
    grModel = gr.fit(assembled)
    predictions = grModel.transform(assembled)

    # Evaluator required for rmse estimation
    evaluator = RegressionEvaluator(labelCol=label, metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    return {
        "RMSE": rmse,
        "predictions": [r["prediction"] for r in predictions.select("prediction").collect()],
    }
def logisT(value):
    """Fit a gaussian/identity GLM on ``value`` and print its parameters.

    Returns a tuple of the fitted coefficients and the constant ``1``.
    """
    estimator = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        maxIter=10,
        regParam=0.3,
    )
    fitted = estimator.fit(value)

    # Report the coefficients and intercept of the fitted model.
    for heading, attribute in (("Coefficients: ", "coefficients"),
                               ("Intercept: ", "intercept")):
        print(heading + str(getattr(fitted, attribute)))

    return (fitted.coefficients, 1)
def model_dev_glm(df_train, df_test, max_iter, fit_intercept, reg_param):
    """Train a gaussian/identity GLM and score it on a test sample.

    Prints R2, MSE, RMSE and MAE (each rounded to 3 decimals) and returns a
    tuple ``(fitted_model, stats)`` where ``stats`` is a one-row pandas
    DataFrame with the same metrics plus the elapsed time in minutes.
    """
    started_at = time()

    # Build and train the model.
    estimator = GeneralizedLinearRegression(labelCol='label',
                                            featuresCol='features',
                                            family="gaussian",
                                            link="identity",
                                            fitIntercept=fit_intercept,
                                            maxIter=max_iter,
                                            regParam=reg_param)
    fitted = estimator.fit(df_train)

    # Score the test sample, keeping only what the evaluator needs.
    scored = fitted.transform(df_test).select(['prediction', 'label'])

    # Collect the statistics, one evaluator pass per metric.
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")
    scores = {}
    for metric in ("r2", "mse", "rmse", "mae"):
        raw_value = evaluator.evaluate(scored, {evaluator.metricName: metric})
        scores[metric] = round(raw_value, 3)

    print("\n++++++ Printing Generalized Linear Model Accuracy ++++++\n")
    print("R Square: " + str(scores["r2"] * 100) + "%")
    print("Mean Squared Error: " + str(scores["mse"]))
    print("Root Mean Squared Error: " + str(scores["rmse"]))
    print("Mean Absolute Error: " + str(scores["mae"]))

    elapsed_minutes = (time() - started_at) / 60

    stats = pd.DataFrame({
        "Model Name": ["Generalized Linear Model"],
        "R Square": scores["r2"],
        "Mean Squared Error": scores["mse"],
        "Root Mean Squared Error": scores["rmse"],
        "Mean Absolute Error": scores["mae"],
        "Time (Min.)": round(elapsed_minutes, 3)
    })
    return (fitted, stats)
def generalized_linear_regression(trainingDataFrame, family="gaussian", link="identity", maxIter=10, regParam=0.3):
    """Fit a generalized linear regression and bundle the results.

    Returns a dict with the keys ``"model"``, ``"summary"``, ``"intercept"``
    and ``"coefficients"``, all taken from the fitted model.
    """
    fitted = GeneralizedLinearRegression(
        family=family,
        link=link,
        maxIter=maxIter,
        regParam=regParam,
    ).fit(trainingDataFrame)

    return {
        "model": fitted,
        "summary": fitted.summary,
        "intercept": fitted.intercept,
        "coefficients": fitted.coefficients,
    }
def main(self, sc, *args):
    """Fit a poisson GLM on the upstream points and save it to the output path.

    The points RDD is obtained from ``self.requires()``, converted to a
    DataFrame through the active SparkSession, and the fitted model is
    written to ``self.output().path``.
    """
    points = self.requires().get_points_rdd(sc)
    estimator = GeneralizedLinearRegression(
        family='poisson',
        link=self.link,
        maxIter=self.iterations,
    )
    session = SparkSession.builder.getOrCreate()
    frame = session.createDataFrame(points)
    estimator.fit(frame).save(self.output().path)
def linear_regression(ticker, writer):
    """Check whether the fitted model predicts the direction of today's close.

    A gaussian/identity GLM is fitted on ``../data/newlr/<ticker>_no_today.csv``
    (libsvm format). Today's volume is read from the second row (column 5) of
    ``../data/tickers/<ticker>.csv`` and used to predict today's close, which
    is then compared with the first two rows of ``../data/newlr/<ticker>.csv``
    (today's close, then yesterday's close).

    Args:
        ticker: Ticker symbol used to build the data file names.
        writer: Unused here; kept so the signature stays unchanged.

    Returns:
        True when the predicted and the actual close lie on the same side of
        yesterday's close (both >= or both <=), False otherwise.

    Raises:
        ValueError: if the ticker file has no second row to read the volume from.
    """
    spark = SparkSession\
        .builder\
        .appName("GeneralizedLinearRegressionExample")\
        .getOrCreate()
    try:
        # $example on$
        # Load training data
        dataset1 = spark.read.format("libsvm")\
            .load("../data/newlr/" + ticker + "_no_today.csv")
        glr1 = GeneralizedLinearRegression(family="gaussian", link="identity",
                                           maxIter=10, regParam=0.3)
        # Fit the model
        model1 = glr1.fit(dataset1)

        # Today's volume sits in the second row of the ticker file.
        today_volume = None
        with open("../data/tickers/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for index, row in enumerate(reader):
                if index == 1:
                    today_volume = float(row[5])
                    break
        if today_volume is None:
            raise ValueError(
                "no second row with today's volume in ticker file for " + ticker)

        predict_close_value = (-1 * float(model1.coefficients[0])
                               + today_volume * float(model1.coefficients[1])
                               + float(model1.intercept))
        print(predict_close_value)

        # Parse the closes as floats: csv yields strings, and comparing them
        # with the float prediction would not be a numeric comparison.
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/newlr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                else:
                    yesterday_close_value = float(row[0])
                    break
    finally:
        # Release the session even when loading, fitting or parsing fails.
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
def test_offset(self):
    """Fit a weighted poisson GLM with an offset column and check its parameters."""
    rows = [
        (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
        (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
        (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
        (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "offset", "features"])

    estimator = GeneralizedLinearRegression(family="poisson", weightCol="weight",
                                            offsetCol="offset")
    fitted = estimator.fit(df)

    # Reference values, compared with an absolute tolerance of 1E-4.
    expected_coefficients = [0.664647, -0.3192581]
    expected_intercept = -1.561613

    coefficients_ok = np.allclose(fitted.coefficients.toArray(),
                                  expected_coefficients, atol=1E-4)
    self.assertTrue(coefficients_ok)
    intercept_ok = np.isclose(fitted.intercept, expected_intercept, atol=1E-4)
    self.assertTrue(intercept_ok)
def regression(train_set, test_set, featuresColumn, labelColumn):
    """Fit a gaussian GLM with log link and return the predictions for ``test_set``.

    The predictions are the ``predictions`` attribute of the summary produced
    by evaluating the fitted model on ``test_set``.
    """
    estimator = GeneralizedLinearRegression(
        featuresCol=featuresColumn,
        labelCol=labelColumn,
        family="gaussian",
        link="log",
        maxIter=10,
        regParam=0.3,
    )
    fitted = estimator.fit(train_set)
    return fitted.evaluate(test_set).predictions
def best_subset_selection_GLM(df, labelCol, Cols, label_is_categorical=False, family='gaussian', link='identity'):
    """Fit a GLM on every non-empty subset of ``Cols`` and collect the AIC values.

    Args:
        df: Source DataFrame; its schema decides which columns are categorical.
        labelCol: Name of the label column, forwarded to ``prepare_data``.
        Cols: Candidate feature column names.
        label_is_categorical: Accepted for interface compatibility. It is not
            forwarded: ``prepare_data`` is always called with
            ``label_is_categorical=False``.
        family: GLM family.
        link: GLM link function.

    Returns:
        pandas DataFrame with the columns ``num_features``, ``AIC`` and
        ``features``, indexed by ``Model ID`` and sorted by AIC in descending
        order.
    """
    # Subsets of size 1..len(Cols): the empty subset is never fitted.
    print('Total number of iterations: {}'.format(2**len(Cols) - 1))

    AIC_values, feature_list, num_features = [], [], []
    for k in range(1, len(Cols) + 1):
        for combo in itertools.combinations(Cols, k):
            continuousCols, categoricalCols = [], []
            for col in combo:
                # typeName() yields 'string' for string columns and does not
                # depend on how the data type object happens to be printed.
                if df.schema[col].dataType.typeName() == 'string':
                    categoricalCols.append(col)
                else:
                    continuousCols.append(col)

            data = prepare_data(df=df,
                                labelCol=labelCol,
                                label_is_categorical=False,
                                categoricalCols=categoricalCols,
                                continuousCols=continuousCols)
            model = GeneralizedLinearRegression(family=family,
                                                link=link,
                                                featuresCol='features',
                                                labelCol='label')
            AIC = model.fit(data).summary.aic

            AIC_values.append(AIC)
            feature_list.append(combo)
            num_features.append(len(combo))
            print('Feature/s: {}, AIC={:.3f}'.format(combo, AIC))

    return pd.DataFrame({
        'num_features': num_features,
        'AIC': AIC_values,
        'features': feature_list
    }).rename_axis('Model ID').sort_values('AIC', ascending=False)
def test_tweedie_distribution(self):
    """Fit tweedie GLMs (default link, then linkPower=-1.0) and check their parameters."""
    rows = [
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    tolerance = 1E-4

    # First fit: variancePower=1.6 with no explicit link power.
    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
    first = glr.fit(df)
    first_coefficients_ok = np.allclose(first.coefficients.toArray(),
                                        [-0.4645, 0.3402], atol=tolerance)
    self.assertTrue(first_coefficients_ok)
    first_intercept_ok = np.isclose(first.intercept, 0.7841, atol=tolerance)
    self.assertTrue(first_intercept_ok)

    # Second fit: same estimator after setting linkPower to -1.0.
    second = glr.setLinkPower(-1.0).fit(df)
    second_coefficients_ok = np.allclose(second.coefficients.toArray(),
                                         [-0.6667, 0.5], atol=tolerance)
    self.assertTrue(second_coefficients_ok)
    second_intercept_ok = np.isclose(second.intercept, 0.6667, atol=tolerance)
    self.assertTrue(second_intercept_ok)
def train_fit_glmm(window, date_label: str):
    """Fit a poisson/log GLM on the ``train`` features and predict from ``retrain``.

    The model is trained with ``denoise("train")`` as features and the
    ``date_label`` expression as label, then applied to ``window`` with
    ``denoise("retrain")`` as features. Returns the page id, the four window
    columns and the prediction.
    """
    estimator = GeneralizedLinearRegression(family="poisson",
                                            link="log",
                                            maxIter=10,
                                            regParam=0.3)

    training_frame = window.select(
        denoise("train").alias("features"),
        F.expr(f"{date_label} as label"),
    )
    fitted = estimator.fit(training_frame)

    # TODO: may want to persist the fitted model
    scoring_frame = window.withColumn("features", denoise("retrain"))
    scored = fitted.transform(scoring_frame)

    return scored.select(
        "page_id", "train", "validate", "retrain", "test", "prediction"
    )
def test_glr_summary(self):
    """Exercise the GLR training summary: every accessor is callable and well-typed."""
    from pyspark.ml.linalg import Vectors

    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
    estimator = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        weightCol="weight",
        fitIntercept=False,
    )
    model = estimator.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary

    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertIsInstance(s.predictions, DataFrame)
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.numInstances, 2)
    self.assertIsInstance(s.residuals(), DataFrame)
    self.assertIsInstance(s.residuals("pearson"), DataFrame)

    # Per-coefficient statistics: each is a list whose entries are floats.
    for stat_name in ("coefficientStandardErrors", "tValues", "pValues"):
        stat = getattr(s, stat_name)
        self.assertTrue(isinstance(stat, list) and isinstance(stat[0], float))

    # Integer-valued summary fields with their expected values.
    expected_counts = (
        ("degreesOfFreedom", 1),
        ("residualDegreeOfFreedom", 1),
        ("residualDegreeOfFreedomNull", 2),
        ("rank", 1),
    )
    for field_name, expected in expected_counts:
        self.assertEqual(getattr(s, field_name), expected)

    self.assertIsInstance(s.solver, str)
    for field_name in ("aic", "deviance", "nullDeviance", "dispersion"):
        self.assertIsInstance(getattr(s, field_name), float)

    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned
    # The child class GeneralizedLinearRegressionTrainingSummary runs full test
    same_summary = model.evaluate(df)
    self.assertAlmostEqual(same_summary.deviance, s.deviance)
def test_glr_load(self):
    """Save a fitted GLR model, load it back and check both behave the same.

    The reloaded model must report the same solver and produce the same
    first four transformed rows as the original model.
    """
    # Local import: only needed to remove the temporary directory afterwards.
    import shutil

    df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)),
                                     (1.0, Vectors.dense(1.0, 2.0)),
                                     (2.0, Vectors.dense(0.0, 0.0)),
                                     (2.0, Vectors.dense(1.0, 1.0))],
                                    ["label", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p")
    model = glr.fit(df)
    self.assertEqual(model.getSolver(), "irls")
    transformed1 = model.transform(df)

    path = tempfile.mkdtemp()
    try:
        model_path = path + "/glr"
        model.save(model_path)
        model2 = GeneralizedLinearRegressionModel.load(model_path)
        self.assertEqual(model2.getSolver(), "irls")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))
    finally:
        # Do not leave the saved model behind, whether the test passed or not.
        shutil.rmtree(path, ignore_errors=True)
def linear_regression(ticker,writer):
    """Check whether the fitted model predicts the direction of today's close.

    A gaussian/identity GLM is fitted on ``../data/lr/<ticker>_no_today.csv``
    (libsvm format); its first coefficient and intercept are written through
    ``writer`` and printed. The predicted close is compared with the first two
    rows of ``../data/lr/<ticker>.csv`` (today's close, then yesterday's).

    Args:
        ticker: Ticker symbol used to build the data file names.
        writer: Object with a ``writerow`` method that receives the
            coefficient/intercept row.

    Returns:
        True when the predicted and the actual close lie on the same side of
        yesterday's close (both >= or both <=), False otherwise.
    """
    spark = SparkSession \
        .builder \
        .appName("GeneralizedLinearRegressionExample") \
        .getOrCreate()
    try:
        # Load training data
        dataset = spark.read.format("libsvm").load("../data/lr/" + ticker + "_no_today.csv")
        glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=1, regParam=0.8)
        # Fit the model
        model = glr.fit(dataset)

        data = [ticker, 'coefficient:', model.coefficients[0], 'intercept:', model.intercept]
        writer.writerow(data)
        print(data)

        # Parse the closes as floats: csv yields strings, and comparing them
        # with the float prediction would not be a numeric comparison.
        today_close_value = 0.0
        yesterday_close_value = 0.0
        with open("../data/lr/" + ticker + ".csv") as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                if index == 0:
                    today_close_value = float(row[0])
                else:
                    yesterday_close_value = float(row[0])
                    break

        # predict
        predict_close_value = -1 * float(model.coefficients[0]) + float(model.intercept)
    finally:
        # Release the session even when loading, fitting or parsing fails.
        spark.stop()

    both_up = (predict_close_value >= yesterday_close_value
               and today_close_value >= yesterday_close_value)
    both_down = (predict_close_value <= yesterday_close_value
                 and today_close_value <= yesterday_close_value)
    return both_up or both_down
def generalized_linear_regression():
    """Walk through fitting, transforming, saving and reloading a GLR model.

    A gaussian/identity model is fitted on four hand-written rows; the
    expected values of the walkthrough are checked with assertions. The
    estimator is saved to ``.//glr`` and the fitted model to ``.//glr_model``,
    and both are loaded back and compared with the originals.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ], ["label", "features"])

    # linkPredictionCol="p" is required: the column `p` is read below.
    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        linkPredictionCol="p",
    )
    model = glr.fit(df)
    transformed = model.transform(df)

    first_row = transformed.head()
    assert abs(first_row.prediction - 1.5) < 0.001
    assert abs(first_row.p - 1.5) < 0.001
    assert model.numFeatures == 2
    assert abs(model.intercept - 1.5) < 0.001

    temp_path = "./"

    # Round-trip the (unfitted) estimator.
    glr_path = temp_path + "/glr"
    glr.save(glr_path)
    glr2 = GeneralizedLinearRegression.load(glr_path)
    assert glr.getFamily() == glr2.getFamily()

    # Round-trip the fitted model.
    model_path = temp_path + "/glr_model"
    model.save(model_path)
    model2 = GeneralizedLinearRegressionModel.load(model_path)
    assert model.intercept == model2.intercept
    assert model.coefficients[0] == model2.coefficients[0]
if __name__ == "__main__":
    # Script entry point: fit a gaussian/identity GLM on the bundled sample
    # data and print the fitted parameters followed by the training summary.
    spark = SparkSession.builder.appName("GeneralizedLinearRegressionExample").getOrCreate()

    # $example on$
    # Load training data
    dataset = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

    glr = GeneralizedLinearRegression(
        family="gaussian",
        link="identity",
        maxIter=10,
        regParam=0.3,
    )

    # Fit the model
    model = glr.fit(dataset)

    # Print the coefficients and intercept for generalized linear regression model
    for heading, attribute in (
        ("Coefficients: ", "coefficients"),
        ("Intercept: ", "intercept"),
    ):
        print(heading + str(getattr(model, attribute)))

    # Summarize the model over the training set and print out some metrics
    summary = model.summary
    for heading, attribute in (
        ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
        ("T Values: ", "tValues"),
        ("P Values: ", "pValues"),
        ("Dispersion: ", "dispersion"),
        ("Null Deviance: ", "nullDeviance"),
        ("Residual Degree Of Freedom Null: ", "residualDegreeOfFreedomNull"),
        ("Deviance: ", "deviance"),
        ("Residual Degree Of Freedom: ", "residualDegreeOfFreedom"),
    ):
        print(heading + str(getattr(summary, attribute)))
def Train(self):
    """Train a generalized linear regression and publish its model summary.

    Steps, as visible in this method:
      1. Build and fit a feature pipeline on ``self._data_frame`` and save it.
      2. Fit the regression, either through a cross-validated parameter grid
         ("kFold") or on a random training split ("trainAndtest").
      3. Save the chosen model, score the validation split and derive the
         ``difference`` and ``mape`` columns.
      4. Compute r2/rmse/mse/mae plus MAPE and quantile summaries.
      5. Fill ``self._model_summary`` and push the summary and cards to
         ``self._result_setter`` / ``self._prediction_narrative``.

    This is Python 2 code (print statements, indexing the result of
    ``filter``).
    """
    st_global = time.time()

    # NOTE(review): algoSetting is assigned but not read again in this method.
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = filter(lambda x:x["algorithmSlug"]==GLOBALSETTINGS.MODEL_SLUG_MAPPING["generalizedlinearregression"],algosToRun)[0]

    # Categorical columns: string columns minus an ignored uid column and date columns.
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns)-set(allDateCols))
    print categorical_columns

    # Numerical columns: every numeric column except the target.
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    # Drop the first 7 characters of a path starting with "file"
    # (presumably a "file://" prefix - verify against the context provider).
    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print "model_path",model_path
    pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
    model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
    # NOTE(review): pmml_filepath is only referenced by the commented-out PMML export below.
    pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

    # Feature pipeline: fit on the full data frame, transform it and persist the pipeline.
    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    # (index, name) pairs of the assembled features, read from the column metadata
    # and sorted by index.
    featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

    # print indexed.select([result_column,"features"]).show(5)
    MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
    glinr = GeneralizedLinearRegression(labelCol=result_column, featuresCol='features',predictionCol="prediction")

    # NOTE(review): if validationDict["name"] is neither "kFold" nor
    # "trainAndtest", bestModel, trainingTime and validationData stay unbound
    # and the code below raises NameError.
    if validationDict["name"] == "kFold":
        defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
        numFold = int(validationDict["value"])
        # A fold count of 0 falls back to 3 folds.
        if numFold == 0:
            numFold = 3
        trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
        paramGrid = ParamGridBuilder()\
            .addGrid(glinr.regParam, [0.1, 0.01]) \
            .addGrid(glinr.fitIntercept, [False, True])\
            .build()
        crossval = CrossValidator(estimator=glinr,
                      estimatorParamMaps=paramGrid,
                      evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                      numFolds=numFold)
        st = time.time()
        # NOTE(review): the cross-validator is fitted on `indexed` (all rows),
        # not on `trainingData`, so validationData is part of the fit - confirm
        # this is intended.
        cvModel = crossval.fit(indexed)
        trainingTime = time.time()-st
        print "cvModel training takes",trainingTime
        bestModel = cvModel.bestModel
    elif validationDict["name"] == "trainAndtest":
        trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
        st = time.time()
        fit = glinr.fit(trainingData)
        trainingTime = time.time()-st
        print "time to train",trainingTime
        bestModel = fit

    # Diagnostic output describing the chosen model's parameters.
    print bestModel.explainParams()
    print bestModel.extractParamMap()
    print bestModel.params
    print 'Best Param (regParam): ', bestModel._java_obj.getRegParam()
    print 'Best Param (MaxIter): ', bestModel._java_obj.getMaxIter()

    # Disabled PMML export, kept for reference:
    # modelPmmlPipeline = PMMLPipeline([
    #   ("pretrained-estimator", objs["trained_model"])
    # ])
    # try:
    #     modelPmmlPipeline.target_field = result_column
    #     modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
    #     sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
    #     pmmlfile = open(pmml_filepath,"r")
    #     pmmlText = pmmlfile.read()
    #     pmmlfile.close()
    #     self._result_setter.update_pmml_object({self._slug:pmmlText})
    # except:
    #     pass

    # Pair each feature name with its fitted coefficient.
    coefficientsArray = [(name, bestModel.coefficients[idx]) for idx, name in featureMapping]
    MLUtils.save_pipeline_or_model(bestModel,model_filepath)

    # Score the validation split and add the residual ("difference") and the
    # absolute percentage error ("mape") columns.
    transformed = bestModel.transform(validationData)
    transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
    transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
    transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
    transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")

    # Down-sample with fraction 100/nrows when there are more than 100 rows.
    sampleData = None
    nrows = transformed.count()
    if nrows > 100:
        sampleData = transformed.sample(False, float(100)/nrows, seed=420)
    else:
        sampleData = transformed
    print sampleData.show()

    # Regression metrics on the scored validation split.
    evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
    metrics = {}
    metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
    metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
    metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
    metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
    runtime = round((time.time() - st_global),2)

    # MAPE statistics, sorted by the integer value of their key.
    # print transformed.count()
    mapeDf = transformed.select("mape")
    # print mapeDf.show()
    mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
    mapeStatsArr = mapeStats.items()
    mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
    # print mapeStatsArr

    # Quantile summary of the predictions, sorted the same way.
    quantileDf = transformed.select("prediction")
    # print quantileDf.show()
    quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
    quantileSummaryArr = quantileSummaryDict.items()
    quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
    # print quantileSummaryArr

    self._model_summary.set_model_type("regression")
    self._model_summary.set_algorithm_name("Generalized Linear Regression")
    self._model_summary.set_algorithm_display_name("Generalized Linear Regression")
    self._model_summary.set_slug(self._slug)
    # NOTE(review): set_training_time is called twice in a row, first with the
    # overall runtime and then with the fit-only time - presumably only the
    # second value is kept; confirm which one is wanted.
    self._model_summary.set_training_time(runtime)
    self._model_summary.set_training_time(trainingTime)
    self._model_summary.set_target_variable(result_column)
    self._model_summary.set_validation_method(validationDict["displayName"])
    self._model_summary.set_model_evaluation_metrics(metrics)
    # NOTE(review): `bestEstimator` is not defined anywhere in this method.
    self._model_summary.set_model_params(bestEstimator.get_params())
    self._model_summary.set_quantile_summary(quantileSummaryArr)
    self._model_summary.set_mape_stats(mapeStatsArr)
    self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
    self._model_summary.set_coefficinets_array(coefficientsArray)
    # NOTE(review): `x_train` is not defined anywhere in this method.
    self._model_summary.set_feature_list(list(x_train.columns))

    # print CommonUtils.convert_python_object_to_json(self._model_summary)
    modelSummaryJson = {
        "dropdown":{
                    "name":self._model_summary.get_algorithm_name(),
                    "accuracy":CommonUtils.round_sig(self._model_summary.get_model_evaluation_metrics()["r2"]),
                    "slug":self._model_summary.get_slug()
                    },
        "levelcount":self._model_summary.get_level_counts(),
        "modelFeatureList":self._model_summary.get_feature_list(),
        "levelMapping":self._model_summary.get_level_map_dict()
    }

    # Convert the summary cards to plain JSON objects and publish everything.
    glinrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

    for card in glinrCards:
        self._prediction_narrative.add_a_card(card)
    self._result_setter.set_model_summary({"generalizedlinearregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
    self._result_setter.set_generalized_linear_regression_model_summary(modelSummaryJson)
    self._result_setter.set_glinr_cards(glinrCards)
# Accuracy of the classifier predictions in rfc_pred.
accuracy = accuracyEval.evaluate(rfc_pred)
print("accuracy of classifier is: ", accuracy)

##### 3b: GLM
# Training data filtered to just the rows with nonzero claims (binaryclaim == 1).
trainFilt = trainingData.filter(col("binaryclaim")==1)

from pyspark.ml.regression import GeneralizedLinearRegression
# Gaussian GLM on the claim amount; its output goes to "combo_prediction" so it
# does not collide with the classifier's "prediction" column.
glr = GeneralizedLinearRegression(featuresCol='finalvector', labelCol='Claim_Amount', regParam=0.01, family='gaussian', predictionCol = "combo_prediction")
glrmodel = glr.fit(trainFilt)

# Only apply the GLM to rows the classifier predicted as nonzero claims.
predFilt = rfc_pred.filter(col("prediction")==1)
combo_pred = glrmodel.transform(predFilt)

# Look at the first rows (up to 100):
combo_pred.select("finalvector",'Claim_Amount', "binaryclaim", "prediction", "combo_prediction" ).show(100)

# Next step: compute the MSE and MAE.
from pyspark.ml.evaluation import RegressionEvaluator
#Generamos un vector con la columna label y la columna array features ignore = ['label'] assembler = VectorAssembler( inputCols=[x for x in train.columns if x not in ignore], outputCol='features') train_LP = assembler.transform(train).select(['label', 'features']) evaluation_LP = assembler.transform(evaluation).select(['label', 'features']) #Definimos el algoritmo del modelo (regresion logistica) model_regresion = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=50, regParam=0.05) # Fit the model model_regresion = model_regresion.fit(train_LP) # Make predictions. predictions = model_regresion.transform(evaluation_LP) # Print the coefficients and intercept for linear regression print("Coefficients: %s" % str(model_regresion.coefficients)) print("Intercept: %s" % str(model_regresion.intercept)) # COMMAND ---------- # Summarize the model over the training set and print out some metrics trainingSummary = model_regresion.summary print("Coefficient Standard Errors: " +
'prediction'].rdd).areaUnderROC)

## Gamma regression on the rows with a positive prediction and a positive label
gam = predictions.filter(predictions.prediction > 0).filter(
    predictions.label > 0)
# The output column is named "gammaprediction" because `gam` already carries a
# "prediction" column (it is filtered on above).
glr = GeneralizedLinearRegression(labelCol="label", featuresCol="pcaFeatures", predictionCol="gammaprediction", family="gamma", link="Inverse", maxIter=10)

## Fit the model
# NOTE(review): the model is fitted and evaluated on the same rows (`gam`).
model = glr.fit(gam)
gammapred = model.transform(gam)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="gammaprediction", metricName="r2")
r2 = evaluator.evaluate(gammapred)
print("Evaluating gamma prediction :")
print("R2 = %g " % r2)

# Report the elapsed time since `begin` (set outside this excerpt) and stop Spark.
end = time.time()
print('tiempo', end - begin)
sc.stop()
# Let's see how many numerical features we have: num_cols = [item[0] for item in df.dtypes if item[1].startswith('int') | item[1].startswith('double')][1:] print(str(len(num_cols)) + ' numerical features') Data = df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"]) Data.show() pd.DataFrame(Data.take(5), columns=Data.columns) testset,trainset = Data.randomSplit([0.3,0.7], seed=25) print("Training Dataset Count: " + str(trainset.count())) print("Test Dataset Count: " + str(testset.count())) ### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION from pyspark.ml.regression import GeneralizedLinearRegression glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features",family="binomial", link="logit", maxIter=10,regParam=0.01) model = glr.fit(Data) summary = model.summary print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors)) print("P Values: " + str(summary.pValues)) #Removing all the columns that had a p-value above 0.05 vs = VectorSlicer(inputCol="features", outputCol="selected_features", indices=[0,2,9,18,21,23,24,26,27,28,31,32,37,41]) Training_set= vs.transform(trainset) Test_set = vs.transform(testset) #### LOGISTIC REGRESSION logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features", maxIter=20,regParam=0.01, elasticNetParam=0.8, family="binomial") logReg_model = logReg.fit(Training_set) trainingSummary = logReg_model.summary roc = trainingSummary.roc.toPandas() print('Training set ROC: ' + str(trainingSummary.areaUnderROC))
def binomialSparkGLF(self):
    """Fit a default-configured GeneralizedLinearRegression and return the model.

    NOTE(review): the estimator is created without arguments, so no binomial
    family is requested here despite the method name, and ``self.Ytrain`` is
    passed as the second positional argument of ``fit`` - confirm that this
    is what the estimator's ``fit`` expects.
    """
    estimator = GeneralizedLinearRegression()
    return estimator.fit(self.Xtrain, self.Ytrain)
def scalarSparkGLR(self):
    """Fit a default-configured GeneralizedLinearRegression on ``self.train``.

    Returns the fitted model produced by ``fit``.
    """
    return GeneralizedLinearRegression().fit(self.train)
# Randomly split the prepared data 80/20 into training and testing sets.
training, testing = modelprep2.randomSplit([0.8, 0.2])
# Row counts can be inspected with modelprep2.count(), training.count()
# and testing.count().

#######################################################################################
#
# Modeling - GLM (Regression)
#
#######################################################################################

# No family or link is given, so the estimator's defaults apply.
glm = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)
glmmodel = glm.fit(training)
summary = glmmodel.summary

# Show the feature list, the coefficients, the intercept and the p-values.
# Each report is wrapped in blank lines by its leading and trailing "\n".
print("\nFeatures: {!s}\n".format(features))
print("\nCoefficients: {!s}\n".format(glmmodel.coefficients))
print("\nIntercept: {!s}\n".format(glmmodel.intercept))
print("\nPValues: {!s}\n".format(summary.pValues))

# Further training-summary metrics that can be printed the same way:
# summary.coefficientStandardErrors, summary.tValues, summary.pValues,
# summary.dispersion, summary.nullDeviance.
metricName="areaUnderROC") auc = evaluator.evaluate(logisticReg_prediction) end = time.time() print('Logistic Regression Execution time:', end - start) print("auc = %g" % auc) train_notzero = trainingData.filter('not_zero != 0') test_notzero = testData.filter('not_zero != 0') #training glm model from pyspark.ml.regression import GeneralizedLinearRegression from pyspark.ml.evaluation import RegressionEvaluator glm_poisson = GeneralizedLinearRegression(featuresCol='features', labelCol='Claim_Amount', maxIter=10, regParam=0.01,\ family='Gamma', link='identity') start = time.time() glm_model = glm_poisson.fit(train_notzero) #select zero sample pred_zero = logisticReg_prediction.filter('prediction == 0') pred_zero = pred_zero.withColumn('claim_prediction', pred_zero['not_zero'] * 0).select( 'Claim_Amount', 'claim_prediction') #extract non zero value pred_nonzero = logisticReg_prediction.filter('prediction != 0') pred_nonzero = pred_nonzero.select('features', 'Claim_Amount') #compare model with non zero value pred_amount = glm_model.transform(pred_nonzero) pred_amount = pred_amount.select('Claim_Amount', 'prediction') pred_amount = pred_amount.withColumnRenamed('prediction', 'claim_prediction')
# Single-argument print(...) calls behave the same on Python 2 and Python 3;
# the original bare print statements are syntax errors on Python 3.
print(summary.objectiveHistory)
print(summary.rootMeanSquaredError)
print(summary.r2)


# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression
# Gaussian family with identity link; the link-scale prediction is written
# to the "linkOut" column.
glr = (
    GeneralizedLinearRegression()
    .setFamily("gaussian")
    .setLink("identity")
    .setMaxIter(10)
    .setRegParam(0.3)
    .setLinkPredictionCol("linkOut")
)
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
# Split the data into a training set and a test set (70/30).
vdata = v_data.select(['features', 'medv'])
vdata.show(10)
splits = vdata.randomSplit([0.7, 0.3])
train_data, test_data = splits

# Train: gaussian family, identity link, `medv` as the label.
glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    labelCol='medv',
    featuresCol='features',
    maxIter=1000,
    regParam=0.3,
)

# Fit the model
GlModel = glr.fit(train_data)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: {!s}".format(GlModel.coefficients))
print("Intercept: {!s}".format(GlModel.intercept))

# Summarize the model over the training set and print out some metrics
summary = GlModel.summary
print("Coefficient Standard Errors: {!s}".format(summary.coefficientStandardErrors))
print("Null Deviance: {!s}".format(summary.nullDeviance))
print("Residual Degree Of Freedom Null: {!s}".format(summary.residualDegreeOfFreedomNull))
print("Deviance: {!s}".format(summary.deviance))
print("Residual Degree Of Freedom: {!s}".format(summary.residualDegreeOfFreedom))
print("AIC: {!s}".format(summary.aic))
print("Deviance Residuals: ")
# Convert the label of rows that have a non-zero claim to 1 (0 otherwise).
# Each condition reads the column from the DataFrame being transformed; the
# original took it from `dataVectorised`, which only resolves while both
# splits share that DataFrame's lineage.
from pyspark.sql.functions import when
train_set1 = training_set.withColumn(
    'Claim_Amount',
    when(training_set['Claim_Amount'] != 0, 1).otherwise(0),
)
test_set1 = test_set.withColumn(
    'Claim_Amount',
    when(test_set['Claim_Amount'] != 0, 1).otherwise(0),
)

# The binary classifier (zero vs. non-zero claim)
model_start = time.time()
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Claim_Amount',
    maxDepth=5,
    numTrees=3,
    seed=myseed,
)
RFC_model = rfc.fit(train_set1)

# Gamma Regressor
# family='gamma' is now explicit: without it the estimator uses its default
# family, so the model was not the gamma regression this section describes.
# Assumes `data_Claim_else` holds only positive claim amounts, as a gamma
# family requires -- TODO confirm.
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(
    featuresCol='features',
    labelCol='Claim_Amount',
    family='gamma',
    link='identity',
)
GLR_model = glr.fit(data_Claim_else)

# Combine the two models: classify the untouched test set (which still has
# the real claim amounts), then regress only on rows predicted as 1.
predict_RFC = RFC_model.transform(test_set)
# select the results which predicted as 1
RFC_result = predict_RFC[predict_RFC['prediction'] == 1].select('features', 'Claim_Amount')
GLR_result = GLR_model.transform(RFC_result)
model_end = time.time()

mse = evaluatorMSE.evaluate(GLR_result)
mae = evaluatorMAE.evaluate(GLR_result)
print('mse :', mse)
print('mae :', mae)
print('Time:', model_end-model_start)
spark.stop()
print("test vector assembled")
test_df.show(5)

# Hold out 30% of `train_df` for testing. seed(0) is called right before
# the split; no seed is passed to randomSplit itself.
seed(0)
trainingData, testData = train_df.randomSplit([0.7, 0.3])

# ## Logistic Regression
# Logistic regression expressed as a binomial GLM with a logit link.
glr = GeneralizedLinearRegression(
    family="binomial",
    link="logit",
    featuresCol="features",
    labelCol="is_duplicate",
)
trainLogitModel = glr.fit(trainingData)

# Predictions of the logistic model on the held-out rows
LogitPredictions = trainLogitModel.transform(testData)

# AUC, scored from the model's `prediction` column
evaluator = BinaryClassificationEvaluator(
    labelCol="is_duplicate",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)
AUClogit = evaluator.evaluate(LogitPredictions)
print("Logistic Regression AUC = %g " % AUClogit)

# ## Decision trees
# Set up the decision tree classifier that is trained and scored next.
dt = DecisionTreeClassifier(maxDepth=15, labelCol="is_duplicate")
cuse_df.show(5)


# In[3]:

# ## Split data into training and test datasets (80/20, fixed seed)
training, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)


# In[4]:

# ## Build Logistic Regression model
from pyspark.ml.regression import GeneralizedLinearRegression
logr = GeneralizedLinearRegression(
    family="binomial",
    link="logit",
    regParam=0.0,
)

# Fit the model to the training data; the result is logr_Model
logr_Model = logr.fit(training)

# Report standard errors, t-values and p-values from the training summary
summary = logr_Model.summary
print("Coefficient Standard Errors: {!s}".format(summary.coefficientStandardErrors))
print("T Values: {!s}".format(summary.tValues))
print("P Values: {!s}".format(summary.pValues))

# #### Prediction on training data
pred_training_cv = logr_Model.transform(training)
pred_training_cv.show(5, truncate=False)

# #### Prediction on test data
pred_test_cv = logr_Model.transform(test)
pred_test_cv.show(5, truncate=False)
# 70/30 split with a fixed seed, followed by the row count of each part.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# COMMAND ----------

from pyspark.ml.regression import GeneralizedLinearRegression

# Load training data (this rebinds `dataset`)
dataset = (
    spark.read.format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt")
)

glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: {!s}".format(model.coefficients))
print("Intercept: {!s}".format(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: {!s}".format(summary.coefficientStandardErrors))
print("T Values: {!s}".format(summary.tValues))
print("P Values: {!s}".format(summary.pValues))
print("Dispersion: {!s}".format(summary.dispersion))
print("Null Deviance: {!s}".format(summary.nullDeviance))
print("Residual Degree Of Freedom Null: {!s}".format(summary.residualDegreeOfFreedomNull))
print("Deviance: {!s}".format(summary.deviance))
print("Residual Degree Of Freedom: {!s}".format(summary.residualDegreeOfFreedom))
# Stage 1: binarise Claim_Amount into `label` (threshold 0.0001) and train
# a decision tree on it; its output goes to `prediction_c`.
binarizer = Binarizer(
    threshold=0.0001,
    inputCol='Claim_Amount',
    outputCol='label',
)
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    predictionCol='prediction_c',
    maxBins=800,
)
pipeline = Pipeline(stages=[binarizer, dt])
dtModel = pipeline.fit(traindata)

# Make predictions on test data using the Transformer.transform() method.
predictions = dtModel.transform(testdata)

# Stage 2 inputs: training rows with a positive claim, and test rows the
# tree predicted as positive.
non_zero_train = traindata.filter(traindata['Claim_Amount'] > 0.0)
non_zero_test = predictions.filter(predictions['prediction_c'] > 0.0)

print("Generalized Linear Regression with gamma family")
from pyspark.ml.regression import GeneralizedLinearRegression
glm_gamma = GeneralizedLinearRegression(
    featuresCol='features',
    labelCol='Claim_Amount',
    maxIter=50,
    family='gamma',
    link='log',
)
glm_model = glm_gamma.fit(non_zero_train)
# `predictions` is rebound to the GLM output from here on.
predictions = glm_model.transform(non_zero_test)

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    labelCol="Claim_Amount",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("RMSE = %g " % rmse)
end = time.time()