def build_and_save_model(df, config):
    """Cross-validate a regression pipeline on ``df`` and save the fitted model.

    The categorical columns are string-indexed, one-hot encoded and assembled
    into a ``features`` vector. Depending on the selected model the vector is
    either standardised and PCA-reduced (linear / single-tree models) or fed
    directly to the regressor (tree ensembles).

    Args:
        df: Spark DataFrame containing the configured categorical columns and
            the label column the regressors and evaluator expect (no label
            column is set here, so their defaults apply).
        config: Mapping with the keys ``"categorical_cols"``,
            ``"numerical_cols"`` and ``"model_path"``. The optional key
            ``"model_selector"`` chooses one of ``"ElasticNet"``,
            ``"DecisionTree"``, ``"RandomForest"`` (default, the previously
            hard-coded value) or ``"GradientBoosting"``.

    Raises:
        ValueError: If the selector names an unsupported model. Previously
            this surfaced later as a ``NameError`` on ``stages``.
    """
    # Feature engineering
    categorical_cols = config["categorical_cols"]
    # NOTE(review): read from config (a missing key still fails fast) but not
    # part of the assembled feature vector -- confirm whether numerical
    # columns were meant to be added to the assembler inputs.
    numerical_cols = config["numerical_cols"]  # noqa: F841
    indexed_cols = [c + '_index' for c in categorical_cols]
    encoded_cols = [c + '_dummy' for c in categorical_cols]

    indexer = StringIndexer(inputCols=categorical_cols,
                            outputCols=indexed_cols,
                            handleInvalid='keep')
    onehotencoder = OneHotEncoder(inputCols=indexed_cols,
                                  outputCols=encoded_cols)
    assembler = VectorAssembler(inputCols=encoded_cols, outputCol='features')
    standardscaler = StandardScaler(inputCol='features',
                                    outputCol='normalized_features',
                                    withMean=True)
    pca = PCA(inputCol='normalized_features', outputCol='pca_features')

    # Stages shared by every model, and the extra ones for PCA-based models.
    base_stages = [indexer, onehotencoder, assembler]
    pca_stages = base_stages + [standardscaler, pca]

    # Tune model
    model_selector = config.get("model_selector", "RandomForest")
    if model_selector == 'ElasticNet':
        regressor = LinearRegression(featuresCol='pca_features')
        grid = ParamGridBuilder() \
            .addGrid(pca.k, [2, 5, 10]) \
            .addGrid(regressor.maxIter, [10]) \
            .addGrid(regressor.regParam, [0.1, 0.3]) \
            .addGrid(regressor.elasticNetParam, [0.0, 0.3, 0.5, 0.8, 1.0]) \
            .build()
        stages = pca_stages + [regressor]
    elif model_selector == 'DecisionTree':
        regressor = DecisionTreeRegressor(featuresCol='pca_features')
        grid = ParamGridBuilder() \
            .addGrid(pca.k, [2, 10, 20]) \
            .addGrid(regressor.maxDepth, [5, 10, 20]) \
            .addGrid(regressor.maxBins, [10]) \
            .build()
        stages = pca_stages + [regressor]
    elif model_selector == 'RandomForest':
        regressor = RandomForestRegressor(featuresCol='features')
        grid = ParamGridBuilder() \
            .addGrid(regressor.maxDepth, [2, 5, 10]) \
            .addGrid(regressor.maxBins, [10]) \
            .addGrid(regressor.numTrees, [20]) \
            .build()
        stages = base_stages + [regressor]
    elif model_selector == "GradientBoosting":
        regressor = GBTRegressor(featuresCol='features')
        grid = ParamGridBuilder() \
            .addGrid(regressor.maxDepth, [2, 5]) \
            .addGrid(regressor.maxBins, [8]) \
            .addGrid(regressor.maxIter, [10]) \
            .build()
        stages = base_stages + [regressor]
    else:
        raise ValueError("Unsupported model_selector: %r" % (model_selector,))

    pipeline = Pipeline(stages=stages)
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=grid,
                        evaluator=RegressionEvaluator(metricName='mae'),
                        numFolds=10,
                        parallelism=3,
                        seed=123)
    model = cv.fit(df)

    # avgMetrics holds one cross-validated MAE per entry of the param grid.
    # Wrapped in a tuple so %-formatting never unpacks the sequence.
    print('mean absolute error: %s' % (model.avgMetrics,))
    print(model.bestModel.stages[-1])
    model.write().overwrite().save(config["model_path"])
"number_blocks" ], outputCol='features') v_df = vectorAssembler.transform(df) #v_df.show(3) #train test split splits = v_df.randomSplit([0.8, 0.2]) train_df = splits[0] test_df = splits[1] from pyspark.ml.regression import LinearRegression lr = LinearRegression(featuresCol='features', labelCol='value_usd', maxIter=10, regParam=0.3, elasticNetParam=0.8) #schema for Model lr_model = lr.fit(train_df) #train model on train_df lr_predictions = lr_model.transform( test_df) #use model to make predictions on test_df lr_predictions.select("prediction", "value_usd", "features", "date").toPandas().to_csv( "predLR.csv") #safe predictions to local drive #Random Forest from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils data_full = sc.textFile(
# Convert this RDD to a DataFrame colNames = ["label", "features"] df = data.toDF(colNames) # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame. # Perhaps you're importing data from a real database. Or you are using structured streaming # to get your data. # Let's split our data into training data and testing data trainTest = df.randomSplit([0.5, 0.5]) trainingDF = trainTest[0] testDF = trainTest[1] # Now create our linear regression model lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8) # Train the model using our training data model = lir.fit(trainingDF) # Now see if we can predict values in our test data. # Generate predictions using our linear regression model for all features in our # test dataframe: fullPredictions = model.transform(testDF).cache() # Extract the predictions and the "known" correct labels. predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0]) labels = fullPredictions.select("label").rdd.map(lambda x: x[0]) # Zip them together predictionAndLabel = predictions.zip(labels).collect()
# Keep only the first record seen for each 'Id'; `keyl` tracks the ids already kept.
for i in range(len(finl)):  # to remove duplicates from join
    if finl[i]['Id'] not in keyl:
        finll.append(finl[i])
        keyl.append(finl[i]['Id'])
df_fin=sqlcontext.createDataFrame(finll)
df_fin.show()
#print(finll)
# Cast the 'rating' column (if present) to float so it can serve as a regression label.
# NOTE(review): the loop variable `col` would shadow pyspark.sql.functions.col
# if this file imports it -- confirm.
for col in df_fin.columns:
    if col=="rating":
        df_fin = df_fin.withColumn(col,df_fin[col].cast('float'))
# Single-feature model: 'age' is the only input column.
assembler=VectorAssembler(inputCols=['age'],outputCol='features')
output=assembler.transform(df_fin)
final_data=output.select('features','rating')
# Unseeded 70/30 split.
train_data,test_data=final_data.randomSplit([0.7,0.3])
rating_lr=LinearRegression(featuresCol='features',labelCol='rating')
train_rating_model=rating_lr.fit(train_data)
# Summary is computed on the training split, not on test_data.
rating_results=train_rating_model.evaluate(train_data)
# Predict on the test split with the label column removed.
unlabeled_data=test_data.select('features')
predictions=train_rating_model.transform(unlabeled_data)
#pp=sqlcontext.read.json('inp1.json')
#pp.show()
#data=pp.map(lambda x: x.split(','))
#r=data.collect()
#print(r)
#import json
with open('inp1.json') as f:
    data = json.load(f)
match_date=data['date']
def scalarSparkLinearRegression(self):
    """Fit a default-configured ``LinearRegression`` on ``self.train``.

    Returns:
        The fitted model returned by ``LinearRegression.fit``.
    """
    return LinearRegression().fit(self.train)
def test_write_property(self):
    """The ``write`` attribute of a LinearRegression must be an ``MLWriter``."""
    estimator = LinearRegression(maxIter=1)
    # Accessed as an attribute (not called), exactly as the check requires.
    writer = estimator.write
    self.assertTrue(isinstance(writer, MLWriter))
def main():
    """Fit two linear regressions on Boston housing data and report test RMSE.

    The dataset is written to ``boston.csv``, re-read through the module-level
    SparkContext ``sc``, reduced to the ``CRIM``/``ZN`` features and the
    ``price`` label, and split 70/30. Two models are fitted on the training
    split (``regParam`` 0.0 and 100.0); predictions and RMSE are reported on
    the held-out split. Previously the held-out split was created but never
    used, so the reported RMSE was a training error.
    """
    # LOAD THE DATA (single call; the Bunch carries data, target and names)
    boston = load_boston()
    df = pd.DataFrame(boston['data'])
    df.columns = boston['feature_names']
    df['price'] = boston['target']
    print(df.head())

    # save to csv
    df.to_csv('boston.csv', index=False)

    # load data with spark way
    data = sc.textFile('boston.csv').map(lambda line: line.split(","))
    headers = data.first()
    # Drop the header row; every remaining field is still a string.
    traindata = data.filter(lambda row: row != headers)
    sqlContext = SQLContext(sc)
    dataFrame = sqlContext.createDataFrame(traindata, [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT', 'price'
    ])
    print('-' * 70)
    print(dataFrame.take(30))
    print('-' * 70)

    # convert string to float in PYSPARK
    # https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
    for name in ("CRIM", "ZN", "price"):
        dataFrame = dataFrame.withColumn(name + "_",
                                         dataFrame[name].cast("float"))
    dataFrame.registerTempTable("temp_sql_table")
    spark_sql_output = sqlContext.sql("""SELECT
                        CRIM_,
                        ZN_,
                        price_
                        FROM temp_sql_table """)
    print(spark_sql_output.take(10))

    # Last selected column is the label, everything before it the features.
    labeled = spark_sql_output.rdd.map(
        lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
    labeled.show()
    (trainingData, testData) = labeled.randomSplit([0.7, 0.3])

    #################### SPARK ML  ####################

    # Define LinearRegression algorithm
    lr = LinearRegression()

    # Fit 2 models, using different regularization parameters
    modelA = lr.fit(trainingData, {lr.regParam: 0.0})
    modelB = lr.fit(trainingData, {lr.regParam: 100.0})

    # Evaluate both models on the held-out split
    evaluator = RegressionEvaluator(metricName="rmse")
    for tag, fitted in (('A', modelA), ('B', modelB)):
        predictions = fitted.transform(testData)
        print('-' * 70)
        print('MODEL %s : ' % tag)
        predictions.select("prediction", "label", "features").show(30)
        print('-' * 70)
        RMSE = evaluator.evaluate(predictions)
        print('-' * 70)
        print("Model%s: Root Mean Squared Error = " % tag + str(RMSE))
        print('-' * 70)
# Three (y, x) samples used as a toy training set.
df = spark.createDataFrame(
    [
        (3.0, 1.0),  ## y, x
        (7.0, 3.0),
        (5.0, 2.0)
    ],
    ["y", "x"])

# Convert column x into a Vector column named "features"
assembler = VectorAssembler(inputCols=["x"], outputCol="features")
dfv = assembler.transform(df)
# Number of fields in the first row (y, x, features).
print(len(dfv.rdd.take(1)[0]))
dfv.show()

lr = LinearRegression(labelCol='y',
                      maxIter=50,
                      regParam=0.01,
                      solver="normal",
                      fitIntercept=True)
model = lr.fit(dfv)
print("intercept: ", model.intercept)  # bias term b
print("---> ", model.coefficients)  # coefficient a of each feature variable

# Two unseen points, already in vector form, to predict on.
test0 = spark.createDataFrame([(4, Vectors.dense(4.0)),
                               (5, Vectors.dense(5.0))], ["id", "features"])
prediction = model.transform(test0)  # predict
prediction.show()
selected = prediction.select("id", "features", "prediction")
for row in selected.collect():
    print(row)
# print(abs(model.transform(test0).head().prediction - (-1.0)) < 0.001)
# Vector assembling features. inputCols_assem = [x for x in df_r.columns if x not in ['id', 'label']] assembler = VectorAssembler(\ inputCols = inputCols_assem, \ outputCol = 'features') df_r = assembler.transform(df_r) # Train test split the Data. train, test = df_r.randomSplit([0.8, 0.2], seed=12345) print 'Finished data preprocessing...' # Fitting & predicting pipeline. evaluator = RegressionEvaluator(metricName="mae") # lr = LinearRegression().setSolver("l-bfgs") lr = LinearRegression() grid = ParamGridBuilder().addGrid(lr.maxIter, [500]) \ .addGrid(lr.regParam, [0]) \ .addGrid(lr.elasticNetParam, [1]) \ .build() lr_cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, \ evaluator=evaluator, numFolds=3) lrModel = lr_cv.fit(train) # Takes 30 min to run. bestModel = lrModel.bestModel print 'MAE: ', lrModel.avgMetrics print 'Best Param (regParam): ', bestModel._java_obj.getRegParam() print 'Best Param (MaxIter): ', bestModel._java_obj.getMaxIter() print 'Best Param (elasticNetParam): ', bestModel._java_obj.getElasticNetParam( ) print 'Param Map: ', bestModel._java_obj.extractParamMap() bestModel.save(
####################### # LOG HYPERPARAMETERS # ####################### run.log("Model Name", model_name) run.log("Max Iterations", maxIters) run.log("Regularization Rate", regParam) run.log_list("Feature Columns", feature_cols) ############### # TRAIN MODEL # ############### print(" * Training {0} model".format(model_name)) # Instantiate New LinearRegression Object lr = LinearRegression(featuresCol='features', labelCol='duration_minutes', maxIter=maxIters, regParam=regParam, solver="auto") # Train model on transformed training data lr_model = lr.fit(trainDF_transformed) lr_full_model = feature_model.copy() lr_full_model.stages.append(lr_model) print(" * Model trained, scoring validation data") # Run the full model (feature steps and trained model) validation_scored = lr_full_model.transform(validDF) ##################### # MODEL PERFORMANCE # #####################
model_df=features_df.select('features','output') # 构建用于线性回归的数据模型 # 5-将数据划分为 训练数据和预测数据 train_df,test_df=model_df.randomSplit([0.7,0.3]) # 训练数据和预测数据的比例为 7比3 print((train_df.count(), len(train_df.columns))) print((test_df.count(), len(test_df.columns))) # 6-构建线性回归模型 from pyspark.ml.regression import LinearRegression # 导入线性回顾库 print('-------------- 构建线性回归模型 ------------------') lin_Reg=LinearRegression(labelCol='output') # labelCol,相对于featrues列,表示要进行预测的列 lr_model=lin_Reg.fit(train_df) # 训练数据 ,fit返回一个 fitted model,即LineRegressionModel对象 print('{}{}'.format('方程截距:',lr_model.intercept)) # intercept 线性方程的截距。 print('{}{}'.format('方程参数系数:',lr_model.coefficients)) # 回归方程中的,变量参数 ,这里分别对应var_1,var_2,var_3,var_4,var_5 training_predictions=lr_model.evaluate(train_df) # 查看预测数据 print('{}{}'.format('误差差值平方:',training_predictions.meanSquaredError)) # 误差值差值平方 print('{}{}'.format('判定系数:',training_predictions.r2 )) # r2 判定系数,用来判定,构建的模型是否能够准确的预测,越大说明预测的准确率越高 # 7-使用预测数据,用已经到构建好的预测模型 lr_model test_results=lr_model.evaluate(test_df)
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session, dataset_regression):
    """Check what autologging records for a parameter-search estimator.

    ``param_search_estimator`` is called to build the search estimator around
    a ``LinearRegression`` with three candidate param maps. After fitting
    inside an MLflow run, the test compares the logged artifacts, params,
    tags and per-candidate child runs against values derived from the
    estimator and the fitted model.

    NOTE(review): the nesting below was reconstructed from a collapsed source
    line; in particular the extent of the ``with`` block and of the
    ``for``/``if`` bodies should be confirmed against the original file.
    """
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    # Candidate settings; the second map is the one expected to win (see best_params).
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr,
                                       estimatorParamMaps=lrParamMaps,
                                       evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        # NOTE(review): presumably the artifact loaders read from the active
        # run, which is why they sit inside the `with` block -- confirm.
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        param_search_estiamtor_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estiamtor_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estiamtor_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)

        assert best_params == load_json_artifact("best_parameters.json")

        search_results = load_json_csv("search_results.csv")

    # Parent run: params are the estimator's own params plus the "best_" ones.
    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v
               for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    # One child run is expected per row of the search results.
    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id,
        "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        # Find the single child run whose params match this row; the filter
        # uses the part of each key after the first ".".
        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        # Rebuild the estimator this child run represents from the row's param map.
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        # CrossValidator models expose avgMetrics (logged with an "avg_"
        # prefix); other search estimators expose validationMetrics.
        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name

        assert math.isclose(avg_metric_value,
                            run_data.metrics[avg_metric_name],
                            rel_tol=1e-6)
        assert math.isclose(avg_metric_value,
                            float(row.get(avg_metric_name)),
                            rel_tol=1e-6)

        # stdMetrics is only checked for CrossValidator on pyspark >= 3.3.
        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) >= Version("3.3"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value,
                                run_data.metrics[std_metric_name],
                                rel_tol=1e-6)
            assert math.isclose(std_metric_value,
                                float(row.get(std_metric_name)),
                                rel_tol=1e-6)
def test_attr_spark(self):
    """Compare the "join" and "union" Spark slicers on the toy dataset.

    A linear regression is fitted on one-hot encoded categorical features,
    per-row errors are computed on the test set, and both slicer variants
    are run on those errors. The test asserts that the join slicer returns
    three slices and that both variants agree on min score, keys, slice count
    and per-slice scores.

    NOTE(review): nesting was reconstructed from a collapsed source line;
    confirm the loop bodies against the original file.
    """
    conf = SparkConf().setAppName("toy_test").setMaster('local[2]')
    num_partitions = 2
    enumerator = "join"
    model_type = "regression"
    label = 'target'
    sparkContext = SparkContext(conf=conf)
    sqlContext = SQLContext(sparkContext)
    train_df = sqlContext.read.csv("toy_train.csv", header='true',
                                   inferSchema='true')
    test_df = sqlContext.read.csv("toy.csv", header='true',
                                  inferSchema='true')
    # initializing stages of main transformation pipeline
    stages = []
    # list of categorical features for further hot-encoding
    cat_features = ['a', 'b', 'c']
    for feature in cat_features:
        string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index").setHandleInvalid("skip")
        encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
        encoder.setDropLast(False)
        stages += [string_indexer, encoder]
    assembler_inputs = [feature + "_vec" for feature in cat_features]
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
    stages += [assembler]
    assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
    stages += [assembler_final]

    pipeline = Pipeline(stages=stages)
    # NOTE(review): the feature pipeline is fitted separately on train and on
    # test, so the two index/encoding mappings are learned independently --
    # confirm this is intended.
    train_pipeline_model = pipeline.fit(train_df)
    test_pipeline_model = pipeline.fit(test_df)
    train_df_transformed = train_pipeline_model.transform(train_df)
    test_df_transformed = test_pipeline_model.transform(test_df)
    # Constant marker column, later passed to spark_utils.calc_loss.
    train_df_transformed = train_df_transformed.withColumn('model_type', sf.lit(0))
    test_df_transformed = test_df_transformed.withColumn('model_type', sf.lit(0))
    # decode_dict maps a running feature-value position to
    # (feature number, original value, index assigned by the indexer, position).
    decode_dict = {}
    counter = 0
    cat = 0
    for feature in cat_features:
        colIdx = test_df_transformed.select(feature, feature + "_index").distinct().rdd.collectAsMap()
        # Order the values of this feature by their assigned index.
        colIdx = {k: v for k, v in sorted(colIdx.items(), key=lambda item: item[1])}
        for item in colIdx:
            decode_dict[counter] = (cat, item, colIdx[item], counter)
            counter = counter + 1
        cat = cat + 1
    train_df_transform_fin = train_df_transformed.select('features', label, 'model_type')
    test_df_transform_fin = test_df_transformed.select('features', label, 'model_type')
    lr = LinearRegression(featuresCol='features', labelCol=label, maxIter=10, regParam=0.0, elasticNetParam=0.8)
    lr_model = lr.fit(train_df_transform_fin)
    # NOTE(review): `eval` shadows the builtin inside this method.
    eval = lr_model.evaluate(test_df_transform_fin)
    # Mean squared error on the test set; passed to both slicers below.
    f_l2 = eval.meanSquaredError
    pred = eval.predictions
    pred_df_fin = pred.withColumn('error', spark_utils.calc_loss(pred[label], pred['prediction'], pred['model_type']))
    predictions = pred_df_fin.select('features', 'error').repartition(num_partitions)
    converter = IndexToString(inputCol='features', outputCol='cats')
    all_features = list(decode_dict)
    # Pull the (features, error) rows to the driver for the slicers.
    predictions = predictions.collect()
    spark_join = spark_slicer.parallel_process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                               k=self.k, w=self.w, loss_type=self.loss_type, enumerator="join")
    spark_union = spark_union_slicer.process(all_features, predictions, f_l2, sparkContext, debug=self.debug, alpha=self.alpha,
                                             k=self.k, w=self.w, loss_type=self.loss_type, enumerator="union")
    self.assertEqual(3, len(spark_join.slices))
    print("check1")
    self.assertEqual(spark_join.min_score, spark_union.min_score)
    print("check2")
    self.assertEqual(spark_join.keys, spark_union.keys)
    print("check3")
    self.assertEqual(len(spark_join.slices), len(spark_union.slices))
    print("check4")
    # Slices are compared position by position.
    idx = -1
    for sliced in spark_join.slices:
        idx += 1
        self.assertEqual(sliced.score, spark_union.slices[idx].score)
    print("check5")
def pipeline(request):
    """Train a cross-validated linear regression and render its predictions.

    Reads the cleaned dataset, casts every column to double (nulls become
    0.0), splits it 70/30, and fits ``VectorAssembler -> StandardScaler ->
    CrossValidator(LinearRegression)`` on the training part. The label is the
    column named by ``custom_fields(request)['prediction']``; the configured
    date column, when present, is excluded from the features.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``show_predictions.html`` response. Its context holds the
        input data and the scored test set as pandas DataFrames, plus the RMSE
        on the test set.

    Raises:
        ValueError: If the label column is not among the dataset's columns.
    """
    unique_fields = custom_fields(request)
    label_col = unique_fields['prediction']
    date_column = CustomFields.objects.first().date_column

    # First, read the data
    data_df = read_df(request, 'clean')
    # NOTE(review): the template receives pandas DataFrames, not JSON strings;
    # the former discarded `to_json()` calls had no effect and were dropped.
    json_df = data_df.toPandas()

    # Cast all the columns to numeric
    new_df = data_df.select(
        [col(c).cast("double").alias(c) for c in data_df.columns])
    new_df = new_df.fillna(0.0)
    new_df.show()

    # Split data into training and test sets
    train, test = new_df.randomSplit([0.7, 0.3])

    # Feature Processing: every column except the label and the date column.
    featuresCols = new_df.columns
    featuresCols.remove(label_col)
    # Explicit membership test instead of a bare `except: pass`, so only the
    # "date column is not a feature" case is skipped.
    if date_column in featuresCols:
        featuresCols.remove(date_column)

    # This concatenates all feature columns into a single feature vector in a
    # new column 'rawFeatures'
    vectorAssembler = VectorAssembler(inputCols=featuresCols,
                                      outputCol='rawFeatures')

    # Model Training
    standardScaler = StandardScaler(inputCol="rawFeatures",
                                    outputCol="features")
    lr = LinearRegression(labelCol=label_col, maxIter=10, regParam=.01)

    # Model tuning
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.maxIter, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

    # We define an evaluation metric.
    # This tells CrossValidator how well we are doing by comparing the true
    # labels with predictions
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=lr.getLabelCol(),
                                    predictionCol=lr.getPredictionCol())

    # Declare the CrossValidator which runs model tuning for us.
    cv = CrossValidator(estimator=lr,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid)

    # Train the pipeline (local name differs from this view's own name)
    ml_pipeline = Pipeline(stages=[vectorAssembler, standardScaler, cv])
    model = ml_pipeline.fit(train)
    predictions = model.transform(test)

    rmse = evaluator.evaluate(predictions)
    print("RMSE on our test set is: " + str(rmse))
    predictions.show()
    predicted_df = predictions.toPandas()

    context = {'all_data': json_df, 'rmse': rmse, 'predicted': predicted_df}
    return render(request, 'show_predictions.html', context)
inferSchema=True) ad.show(5) # In[3]: # Transform data structure from pyspark.ml.linalg import Vectors ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[1:4]), x[-1]]).toDF( ['features', 'label']) ad_df.show(5) # In[4]: # Build linear regression model from pyspark.ml.regression import LinearRegression lr = LinearRegression(featuresCol='features', labelCol='label') # In[5]: # Fit the model lr_model = lr.fit(ad_df) # In[6]: # Prediction pred = lr_model.transform(ad_df) pred.show(5) # In[7]: # Module evaluation from pyspark.ml.evaluation import RegressionEvaluator evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
# Feature scaling standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled") scaler = standardScaler.fit(df) scaled_df = scaler.transform(df) # Split data into training and test train_data, test_data = scaled_df.randomSplit([.7, .3], seed=1234) train_data = train_data.select("features_scaled", "label") test_data = test_data.select("features_scaled", "label") train_data = train_data.withColumnRenamed("features_scaled", "features") test_data = test_data.withColumnRenamed("features_scaled", "features") # Declare regression ML model lr = LinearRegression(labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8) # Train model on training data linearModel = lr.fit(train_data) # Test model on test data predicted = linearModel.transform(test_data) predictions = predicted.select("prediction").rdd.map(lambda x: x[0]) labels = predicted.select("label").rdd.map(lambda x: x[0]) predictionAndLabel = predictions.zip(labels).collect() print predictionAndLabel[:5] # model stats #linearModel.coefficients #linearModel.intercept
# Check column data types print('\n', cars.dtypes, '\n') assembler = VectorAssembler(inputCols=['weight_kg', 'cyl', 'type_dummy', 'density', 'density_area', 'density_volume'], outputCol='features') cars = assembler.transform(cars) kars = cars.select('consumption', 'features') print(kars.toPandas().sample(12)) # Split the data into training and testing sets kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23) regression = LinearRegression(labelCol='consumption').fit(kars_train) # Create predictions for the testing data and take a look at the predictions predictions = regression.transform(kars_test) print("\nStandard Linear Regression") #print("\nStandard Linear Regression\nSample") #print(predictions.toPandas().sample(12)) # Print the coefficients and RMSE for linear regression trainingSummary = regression.summary print("Coefficients: %s" % str(regression.coefficients)) print("RMSE: %f" % trainingSummary.rootMeanSquaredError) # Ridge regression ridge = LinearRegression(labelCol='consumption', elasticNetParam=0, regParam=0.1).fit(kars_train) # Create predictions for the testing data and take a look at the predictions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

# Index the three categorical columns; unseen labels get their own bucket.
stockSI = StringIndexer(inputCol="Symbol", outputCol="indexedSymbol", handleInvalid='keep')
sectorSI = StringIndexer(inputCol='Sector', outputCol='indexedSector', handleInvalid='keep')
industrySI = StringIndexer(inputCol='Industry', outputCol='indexedIndustry', handleInvalid='keep')

# One-hot encode each index column.
stockEnc = OneHotEncoder(inputCol="indexedSymbol", outputCol="encodedSymbol")
sectorEnc = OneHotEncoder(inputCol="indexedSector", outputCol="encodedSector")
industryEnc = OneHotEncoder(inputCol="indexedIndustry", outputCol="encodedIndustry")

# Assemble categorical and numerical features separately, then combine them.
catVa = VectorAssembler(inputCols=["encodedSymbol", "encodedSector", "encodedIndustry"], outputCol="catFeatures")
# Fixed: `inputCols[...]` (missing `=`) subscripted an undefined name.
numVa = VectorAssembler(inputCols=["yVolume", 'ySpread', "yPriceChange"], outputCol="numFeatures")
# NOTE(review): no definition of `va` is visible in this part of the file;
# it is defined here as the assembler producing the "features" column that
# `norm` and `stockLr` read -- confirm against the rest of the file.
va = VectorAssembler(inputCols=["catFeatures", "numFeatures"], outputCol="features")

splits = [-float("inf"), -0.2, -0.15, -0.1, -0.05, -0.0, 0.05, 0.1, 0.2, float("inf")]

norm = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)

# NOTE(review): the regressor reads 'features', not the 'normFeatures'
# column produced by `norm` -- confirm which one is intended.
stockLr = LinearRegression(labelCol='PriceChange', featuresCol='features', predictionCol='pPriceChange')

# catVa / numVa must run before va, otherwise its input columns do not exist.
stages = [stockSI, sectorSI, industrySI, stockEnc, sectorEnc, industryEnc, catVa, numVa, va, norm]

lrPipeline = Pipeline(stages=stages + [stockLr])

(trainingData, testData) = priceChanges.randomSplit([0.7, 0.3])

lrModel = lrPipeline.fit(trainingData)
lrPredictions = lrModel.transform(testData)

# Print the coefficients and intercept for linear regression.
# The regressor is always the last stage; a fixed index breaks whenever the
# stage list changes.
print("Coefficients: %s" % str(lrModel.stages[-1].coefficients))
print("Intercept: %s" % str(lrModel.stages[-1].intercept))

# Summarize the model over the training set and print out some metrics
# Add the assembled feature vector column and preview the result.
flights_assembled = assembler.transform(flights_to_model)
flights_assembled.show(5)

# Randomly split the assembled data into a training
# sample (70% of records) and a test sample (30% of
# records):
(train, test) = flights_assembled.randomSplit([0.7, 0.3])

# Import and use `LinearRegression` to specify the linear
# regression model and fit it to the training sample:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol="features", labelCol="arr_delay")
lr_model = lr.fit(train)

# Examine the model intercept and slope
# (bare expressions: they only display a value in an interactive session):
lr_model.intercept
lr_model.coefficients

# Evaluate the linear model on the test sample:
lr_summary = lr_model.evaluate(test)

# R-squared is the fraction of the variance in the test
# sample that is explained by the model:
housing_train_df.show(5)

# NOTE: from here on `round` is the Spark column function, not the builtin.
from pyspark.sql.functions import round
# The regression label is log_price rounded to 4 decimals.
housing_train_df = housing_train_df.withColumn("label", round('log_price', 4))
print("-----%%-----")
housing_train_df.show(5)

# Seeded 70/30 split so the partition is reproducible.
seed = 42
train_df, test_df = housing_train_df.randomSplit(
    [0.7, 0.3], seed=seed)

from pyspark.ml.regression import LinearRegression

# Unregularised fit; default 'features' and 'label' columns are used.
linreg = LinearRegression(maxIter=500, regParam=0.0)
lm = linreg.fit(train_df)
print("Intercept ", lm.intercept)
print("Coefficients ", lm.coefficients)

y_pred = lm.transform(test_df)
y_pred.select('features', 'label', 'prediction').show(5)

from pyspark.sql.functions import exp
# The label is a log price, so exponentiate the prediction to undo the log.
y_pred = y_pred.withColumn("y_pred", exp('prediction'))
y_pred.show(5)
# Assemble the nine numeric car attributes into one 'features' vector column.
featureAssembler = VectorAssembler(inputCols=[
    "wheel-base", "length", "width", "height", "curb-weight", "engine-size",
    "compression-ratio", "city-mpg", "highway-mpg"
],
                                   outputCol='features')
output = featureAssembler.transform(training)

# Unseeded 70/30 split into train and test.
splits = output.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

# Elastic-net regularised linear regression predicting 'price'.
lr = LinearRegression(featuresCol='features',
                      labelCol='price',
                      maxIter=10,
                      regParam=0.3,
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

# Metrics below come from the fitted model's own summary (training split).
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

# # Fit the model
# lrModel = lr.fit(training)
#
# # Print the coefficients and intercept for linear regression
# print("Coefficients: %s" % str(lrModel.coefficients))
def train(p_lag, dataFrame, model_path=None):
    """Fit one lagged linear regression per variable (VAR-style model).

    For every column except ``'TimeStamp'`` and every lag ``1..p_lag`` a
    lagged copy of the column is added. Each non-timestamp column is then
    regressed on all lagged columns with its own ``VectorAssembler ->
    LinearRegression`` pipeline.

    Args:
        p_lag: Number of lags to include.
        dataFrame: Spark DataFrame whose columns are the model variables
            (plus an optional ``'TimeStamp'`` column), in time order.
        model_path: Unused here; kept for interface compatibility.

    Returns:
        A tuple ``(lrModels, models)``: the list of target column names and a
        dict mapping each name to its fitted ``PipelineModel``.

    NOTE(review): nesting was reconstructed from a collapsed source line;
    confirm the placement of the debug ``print``/``show`` calls.
    """
    x_list = dataFrame.columns
    dataFrame = dataFrame.withColumn("id", monotonically_increasing_id())

    # Here, VAR model regression_type is "const" same to R VAR library, and
    # the default in Python VAR library.
    # A single unpartitioned window ordered by id: lag() looks back over the
    # whole frame.
    w = Window().partitionBy().orderBy(col("id"))

    ys_lagged_list = ["const"]
    for i in range(1, p_lag + 1):
        # Only the original columns are lagged; positions are used because
        # `dataFrame[j]` selects the j-th column.
        for j, x_name in enumerate(x_list):
            # making sure index column is not considered as feature column
            if x_name != 'TimeStamp':
                lagged_name = "%st-%s" % (x_name, str(i))
                ys_lagged_list.append(lagged_name)
                print('2', ys_lagged_list)
                dataFrame = dataFrame.withColumn(
                    lagged_name, lag(dataFrame[j], i, 0).over(w))
    dataFrame.show(5)
    print('ys_lagged_list', ys_lagged_list)

    # The "const" column is added and dropped again before feature assembly;
    # the intercept comes from fitIntercept=True below.
    dataFrame = dataFrame.withColumn("const", lit(1))
    dataFrame = dataFrame.withColumn("const", lag("const", p_lag, 0).over(w))
    # Drop the first p_lag rows, whose lagged values are only the 0 default.
    dataFrame = dataFrame.withColumn("rid", monotonically_increasing_id())
    dataFrame = dataFrame.filter(dataFrame.rid >= p_lag)

    # The former `ys_lagged.count()` triggered a full Spark job whose result
    # was never used; it and the other unused locals were removed.
    dataFrame = dataFrame.drop('id')
    dataFrame = dataFrame.drop('rid')
    dataFrame = dataFrame.drop('const')

    # Features are every remaining column that is not an original variable,
    # i.e. the lagged columns, in schema order.
    input_feature_name = [
        name for name in dataFrame.schema.names if name not in x_list
    ]

    # assemble the vector for MLlib linear regression
    assembler_for_lag = VectorAssembler(inputCols=input_feature_name,
                                        outputCol="features")

    models = {}
    lrModels = []
    for select_y in x_list:
        if select_y == 'TimeStamp':
            continue
        model_key = '{}'.format(select_y)
        # ML model will be trained for each micro batch if existing model is
        # not provided
        lr = LinearRegression(featuresCol='features',
                              labelCol=model_key,
                              maxIter=1000,
                              fitIntercept=True)
        fitted = Pipeline(stages=[assembler_for_lag, lr]).fit(dataFrame)
        # model_val.write().overwrite().save(model_path+'{}'.format(select_y))
        lrModels.append(model_key)
        models[model_key] = fitted
    return lrModels, models
bin/spark-submit examples/src/main/python/ml/train_validation_split.py """ if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("TrainValidationSplit")\ .getOrCreate() # $example on$ # Prepare training and test data. data = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") train, test = data.randomSplit([0.9, 0.1], seed=12345) lr = LinearRegression(maxIter=10) # We use a ParamGridBuilder to construct a grid of parameters to search over. # TrainValidationSplit will try all combinations of values and determine best model using # the evaluator. paramGrid = ParamGridBuilder()\ .addGrid(lr.regParam, [0.1, 0.01]) \ .addGrid(lr.fitIntercept, [False, True])\ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .build() # In this case the estimator is simply the linear regression. # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. tvs = TrainValidationSplit( estimator=lr, estimatorParamMaps=paramGrid,
return v_assembler.transform(data) if __name__ == "__main__": train_ratio = 0.8 test_ratio = 1 - train_ratio # create SparkSession - the entry to the cluster spark = SparkSession.builder.master("spark://192.168.50.10:7077").appName("Linear regression with pipeline - Boston").getOrCreate() data = prepare_data("BostonHousing.csv") # split data into train and test DataFrames train, test = data.randomSplit([train_ratio, test_ratio]) poly_exp = PolynomialExpansion(degree=3, inputCol="features", outputCol="poly_features") lr = LinearRegression(regParam=0.1, featuresCol="poly_features") pipeline = Pipeline(stages=[poly_exp, lr]) # fit the model model = pipeline.fit(train) evaluator = RegressionEvaluator() prediction_and_labels = model.transform(train).select("prediction", "label") print("Precision train: " + str(evaluator.evaluate(prediction_and_labels))) prediction_and_labels = model.transform(test).select("prediction", "label") print("Precision test: " + str(evaluator.evaluate(prediction_and_labels)))
def binomialSparkLinearRegression(self):
    """Fit a default-configured ``LinearRegression`` and return the fit result.

    The estimator is created with no arguments and fitted on the instance's
    ``Xtrain`` and ``Ytrain`` attributes; whatever ``fit`` returns is handed
    back to the caller unchanged.

    NOTE(review): the import of ``LinearRegression`` is not visible here. If
    it is ``pyspark.ml.regression.LinearRegression``, the second positional
    argument of ``fit`` is a param map, not a label set -- confirm.
    """
    return LinearRegression().fit(self.Xtrain, self.Ytrain)
], outputCol='features') output = assembler.transform(data) # adds a "features" column # dependent variable = "Yearly Amount Spent" final_data = output.select('features', 'Yearly Amount Spent') ### Train vs Test split train_data, test_data = final_data.randomSplit([0.7, 0.3]) #train_data.describe().show() #summary statistics of label column ################################### ## Fit Linear Regression Model ### ################################### # train model on training set lr = LinearRegression(labelCol='Yearly Amount Spent') lr_model = lr.fit(train_data) # predict test set test_results = lr_model.evaluate(test_data) ################################### ## EVALUATE LINEAR REGRESSION #### ################################### #test_results.residuals.show() # residuals print(test_results.rootMeanSquaredError) # root mean squared error print(test_results.r2) # R squared print(test_results.meanAbsoluteError) print(test_results.meanSquaredError) ###########################################################
pipeline = Pipeline(stages=stages) pipelineModel = pipeline.fit(df) df = pipelineModel.transform(df) selectedCols = ["close", "features"] df = df.select(selectedCols) df.printSchema() # split to train & test train, test = df.randomSplit([0.7, 0.3], seed=2018) print("Training Dataset Count: " + str(train.count())) print("Test Dataset Count: " + str(test.count())) # fit lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", labelCol="close") lrModel = lr.fit(train) trainingSummary = lrModel.summary print("RMSE: %f" % trainingSummary.rootMeanSquaredError) print("r2: %f" % trainingSummary.r2) train.describe().show() # get prediction lr_predictions = lrModel.transform(test) lr_predictions.select("prediction", "close", "features").show(5) # R2 on test lr_evaluator = RegressionEvaluator(predictionCol="prediction",
+-------+------------------+ |summary| Petal_Width| +-------+------------------+ | count| 108| | mean|1.1703703703703703| | stddev|0.7605039228590301| | min| 0.1| | max| 2.5| +-------+------------------+ """ # let's create the regression model from pyspark.ml.regression import LinearRegression lr = LinearRegression(featuresCol='features', labelCol='Petal_Width', predictionCol='prediction') # adapt it to the data lr_model = lr.fit(train) # print the coefficients print("Coefficients: {} Intercept: {}".format(lr_model.coefficients, lr_model.intercept)) # Coefficients: [-0.2384270208878119,0.20670865063951435,0.5267224329790431] Intercept: -0.03679454895160388 # let's create predictions on test data test_features = test.select('features') predictions = lr_model.transform(test_features)
import os
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Script: load data/example_8.json from the current working directory, assemble
# a feature vector, fit a linear regression and report its parameters.

spark = SparkSession.builder.getOrCreate()

# Build the input path with os.path.join instead of hard-coded '\\' separators
# so the script also runs on non-Windows systems.
data_path = os.path.join(os.getcwd(), 'data')
json_df1_path = os.path.join(data_path, 'example_8.json')

# multiLine=True lets a single JSON record span several lines of the file.
df = spark.read.json(json_df1_path, multiLine=True)
df.show()

# Pack the input column into the single vector column the regressor expects.
vectorAssembler = VectorAssembler(inputCols=["diameter"], outputCol="features")
df_vector = vectorAssembler.transform(df)
df_vector.show()

# NOTE(review): the only feature ("diameter") is also the label, so the model
# is regressing the column on itself -- confirm which column is the intended
# predictor.
lr = LinearRegression(featuresCol="features", labelCol="diameter")
lrModel = lr.fit(df_vector)

# Bare attribute expressions are silently discarded when this runs as a
# script, so print the fitted parameters and the training RMSE explicitly.
print("Coefficients:", lrModel.coefficients)
print("Intercept:", lrModel.intercept)
print("RMSE:", lrModel.summary.rootMeanSquaredError)
# In[136]: from pyspark.ml.regression import RandomForestRegressor, LinearRegression # This is our regressor from pyspark.ml.evaluation import RegressionEvaluator # Module to evaluate fit # In[127]: rf = RandomForestRegressor(labelCol="ups", featuresCol="features", numTrees=5) # This is our regressor="score", featuresCol="features", numTrees=5) # Create a random forest regressor # In[39]: rf = LinearRegression(labelCol="ups", featuresCol="features") # Create a linear regressor # In[128]: (trainingData, testData) = vectorDf.randomSplit([0.7, 0.3]) # In[129]: model = rf.fit(trainingData) # In[130]: predictions = model.transform(testData) # In[132]: