def runGBT(busDF='', usrDF='', revDF='', fraction=(0.7, 0.3)):
    """Fit a GBT rating predictor on a train split and print its test RMSE.

    Args:
        busDF: Forwarded unchanged to ``traingbt.dataClean``.
        usrDF: Forwarded unchanged to ``traingbt.dataClean``.
        revDF: Forwarded unchanged to ``traingbt.dataClean``.
        fraction: Train/test weights handed to ``DataFrame.randomSplit``.
            Any sequence is accepted; it is copied into a list first.

    Returns:
        None. Results are only printed.
    """
    print('GBTRegressor:')
    print(' Rating Prediction:')
    gbt = GBTRegressor(maxIter=1, maxDepth=1, seed=42)
    businessDF, userDF, starDF = traingbt.dataClean(busDF=busDF, usrDF=usrDF, revDF=revDF)
    # split starDF to training data and test data
    trainStarDF, testStarDF = starDF.randomSplit(list(fraction))
    trainDF = traingbt.transData4GBT(businessDF, userDF, trainStarDF)
    model = gbt.fit(trainDF)
    testDF = traingbt.transData4GBT(businessDF, userDF, testStarDF)
    predDF = model.transform(testDF)
    predDF.show()
    # Average the squared errors inside Spark instead of collecting every
    # row to the driver just to sum them.
    squared_errors = predDF.rdd.map(lambda row: (row.label - row.prediction) ** 2)
    RMSE = math.sqrt(squared_errors.mean())
    print(' GBTRegressor RMSE: %.8f' % RMSE)
    print(' Recommendation:')
    # recDF = traingbt.recommendation(businessDF, testStarDF, testDF, model)
    # recDF.printSchema()
    # NOTE: the recommendation step is disabled above, so this line repeats
    # the rating-prediction RMSE.
    print(' Recommendation RMSE: %.8f' % RMSE)
def gbdtRegression(df, arguments):
    """Fit a ``GBTRegressor`` on ``df`` using optional overrides.

    Args:
        df: Training DataFrame passed straight to ``GBTRegressor.fit``.
        arguments: Object exposing ``maxDepth``, ``minInstancesPerNode``,
            ``numTrees`` and ``stepSize`` attributes; an attribute that is
            ``None`` keeps the built-in default (5, 1, 20 and 0.1).

    Returns:
        The fitted model returned by ``GBTRegressor.fit``.
    """
    from pyspark.ml.regression import GBTRegressor

    def _as_count(raw, default):
        # Tree-shape parameters are integers; go through float() first so
        # inputs such as "5.0" that the previous float() cast accepted
        # still work.
        return default if raw is None else int(float(raw))

    numTrees = _as_count(arguments.numTrees, 20)
    maxDepth = _as_count(arguments.maxDepth, 5)
    minInstancesPerNode = _as_count(arguments.minInstancesPerNode, 1)
    stepSize = 0.1 if arguments.stepSize is None else float(arguments.stepSize)

    gbdt = GBTRegressor(maxIter=numTrees,
                        stepSize=stepSize,
                        maxDepth=maxDepth,
                        minInstancesPerNode=minInstancesPerNode)
    return gbdt.fit(df)
def test_gbt_regressor(self):
    """Convert a fitted GBTRegressor to ONNX and compare its predictions."""
    rows = [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ]
    data = self.spark.createDataFrame(rows, ["label", "features"])
    model = GBTRegressor(maxIter=5, maxDepth=2, seed=42).fit(data)

    feature_count = data.first()[1].size
    input_types = [('features', FloatTensorType([1, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml GBTRegressor', input_types,
                                 spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    feature_frame = data.toPandas().features.apply(
        lambda vec: pandas.Series(vec.toArray()))
    data_np = feature_frame.values.astype(numpy.float32)
    expected = [predicted.toPandas().prediction.values.astype(numpy.float32)]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def fit(self):
    """
    Assembles the feature vector, fits a GBT regressor on ``self.data``,
    saves the fitted model and prints its RMSE.

    No train/test split is performed: the model is fitted and evaluated on
    the same frame, so the reported RMSE is a training error.

    :return: None
    """
    excluded = ('datetime', 'label', 'speed_overground')
    cols = [x for x in self.data.columns if x not in excluded]
    assembler = VectorAssembler(handleInvalid="keep") \
        .setInputCols(cols) \
        .setOutputCol("features")
    print('assembler')
    train = assembler.transform(self.data).drop(*cols)
    gbt = GBTRegressor(labelCol="speed_overground",
                       featuresCol="features",
                       predictionCol='predictions')
    print('Train model.')
    model = gbt.fit(train)
    # Persist the fitted model, replacing any previously saved copy.
    model.write().overwrite().save('myGBTRegressor_nan')
    predictions = model.transform(train)
    evaluator = RegressionEvaluator(labelCol="speed_overground",
                                    predictionCol="predictions",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)
def run_GBT_Regression(dependent_variable, data_vector):
    """Fit a GBT regressor and report R2, RMSE and feature importances.

    Args:
        dependent_variable: Name of the label column.
        data_vector: Input handed to ``create_train_test_df`` together with
            ``dependent_variable`` to obtain the train and test frames.

    Returns:
        ``[r2, rmse, feature_importances]`` where the importances are a
        plain Python list.
    """
    train_df, test_df = create_train_test_df(dependent_variable, data_vector)
    regressor = GBTRegressor(featuresCol='features',
                             labelCol=dependent_variable,
                             maxIter=10)
    regressor_model = regressor.fit(train_df)
    print('feature importance is {}'.format(
        regressor_model.featureImportances))

    # Make predictions.
    predictions = regressor_model.transform(test_df)

    # Select example rows to display.
    predictions.select("prediction", dependent_variable, "features").show()

    # Select (prediction, true label) and compute test error
    r2_evaluator = RegressionEvaluator(labelCol=dependent_variable,
                                       predictionCol="prediction",
                                       metricName="r2")
    rmse_evaluator = RegressionEvaluator(labelCol=dependent_variable,
                                         predictionCol="prediction",
                                         metricName="rmse")
    # Each evaluate() call runs a Spark job, so compute each metric once and
    # reuse it for both the printout and the return value.
    r2 = r2_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)
    print("R Squared (R2) on test data = %g" % r2)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    return [r2, rmse, regressor_model.featureImportances.toArray().tolist()]
def gradientBoostRegressorModel(self):
    """Fit, evaluate and persist a GBT regressor built from instance settings.

    Returns the statistics produced by
    ``randomGradientRegressionModelEvaluation`` with an added
    ``"modelPersistLocation"`` entry describing where the model was saved.
    """
    estimator = GBTRegressor(labelCol=self.labelColm,
                             featuresCol=self.featuresColm,
                             predictionCol=self.modelSheetName)
    regressor = estimator.fit(self.trainData)
    # predictionData = regressor.transform(self.testData)

    regressionStat = self.randomGradientRegressionModelEvaluation(regressor=regressor)

    # persisting model
    modelName = "gradientBoostModel"
    extention = ".parquet"
    modelStorageLocation = (self.locationAddress + self.userId.upper()
                            + modelName.upper() + extention)
    regressor.write().overwrite().save(modelStorageLocation)

    regressionStat["modelPersistLocation"] = {
        "modelName": modelName,
        "modelStorageLocation": modelStorageLocation,
    }
    return regressionStat

# reference for the future development.
def build_gradient_boosted_tree_regression(observation_df, feature_columns):
    """Fit a GBT regressor for ``duration_sec`` and print test RMSE and R2.

    Returns the fitted model.
    """
    # Create new column with all of the features
    vector_observation_df = create_feature_column(
        observation_df, feature_columns, ['features', 'duration_sec'])
    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])

    estimator = GBTRegressor(featuresCol="features",
                             labelCol="duration_sec",
                             maxIter=15)
    fitted = estimator.fit(train_df)

    test_predictions = fitted.transform(test_df)
    test_predictions.select("prediction", "duration_sec", "features").show(5)

    reports = (
        ("rmse", "RMSE on test data = %g"),
        ("r2", "R2 on test data = %g"),
    )
    for metric, template in reports:
        evaluator = RegressionEvaluator(predictionCol='prediction',
                                        labelCol="duration_sec",
                                        metricName=metric)
        print(template % evaluator.evaluate(test_predictions))
    return fitted
def model_dev_gbm(df_train, df_test, max_iter, max_bins, max_depth):
    """Train a GBT regressor, score it on ``df_test`` and summarise metrics.

    Returns a ``(fitted_model, stats_frame)`` tuple, where ``stats_frame``
    is a one-row pandas DataFrame of the rounded metrics and elapsed minutes.
    """
    started = time()

    # Create an Initial Model Instance
    estimator = GBTRegressor(labelCol='label',
                             featuresCol='features',
                             featureSubsetStrategy='all',
                             lossType='squared',
                             maxIter=max_iter,
                             maxBins=max_bins,
                             maxDepth=max_depth)

    # Training The Model
    gbm_final_model = estimator.fit(df_train)

    # Scoring The Model On Test Sample
    scored = gbm_final_model.transform(df_test).select(['prediction', 'label'])

    # Collecting The Model Statistics
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label")

    def rounded_metric(name):
        # One Spark job per metric, rounded to three decimals.
        return round(evaluator.evaluate(scored, {evaluator.metricName: name}), 3)

    gbm_r2 = rounded_metric("r2")
    gbm_mse = rounded_metric("mse")
    gbm_rmse = rounded_metric("rmse")
    gbm_mae = rounded_metric("mae")

    # Printing The Model Statistics
    print("\n++++++ Printing Gradient Boosting Model Accuracy ++++++\n")
    print("R Square: " + str(gbm_r2 * 100) + "%")
    print("Mean Squared Error: " + str(gbm_mse))
    print("Root Mean Squared Error: " + str(gbm_rmse))
    print("Mean Absolute Error: " + str(gbm_mae))

    elapsed_minutes = (time() - started) / 60
    gbm_model_stat = pd.DataFrame({
        "Model Name": ["Gradient Boosting"],
        "R Square": gbm_r2,
        "Mean Squared Error": gbm_mse,
        "Root Mean Squared Error": gbm_rmse,
        "Mean Absolute Error": gbm_mae,
        "Time (Min.)": round(elapsed_minutes, 3)
    })
    return (gbm_final_model, gbm_model_stat)
def GBT(trainingData, testData):
    """
    Fit a gradient boosted tree regressor and score the test data.

    :param trainingData: DataFrame the regressor is fitted on
    :param testData: DataFrame the fitted model is applied to
    :return: Trained model, predictions
    """
    fitted = GBTRegressor(maxIter=100, maxDepth=6, seed=42).fit(trainingData)
    return fitted, fitted.transform(testData)
def traingbt(datafrom='json', business_path='', user_path='', star_path=''):
    """Load business/user/star data and fit a GBT regressor on it.

    Args:
        datafrom: ``'json'`` to load through ``loadDataJson`` using the
            three path arguments, or ``'mongodb'`` to load through
            ``loadDataMongo``.
        business_path: Passed to ``loadDataJson``.
        user_path: Passed to ``loadDataJson``.
        star_path: Passed to ``loadDataJson``.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``datafrom`` is not one of the supported sources.
    """
    # Reject unknown sources up front; previously they fell through both
    # branches and crashed later on unbound DataFrame names.
    if datafrom not in ('json', 'mongodb'):
        raise ValueError(
            "unsupported datafrom %r; expected 'json' or 'mongodb'" % (datafrom,))

    if datafrom == 'json':
        businessDF, userDF, starDF = loadDataJson(business_path=business_path,
                                                  user_path=user_path,
                                                  star_path=star_path)
    else:
        businessDF, userDF, starDF = loadDataMongo()

    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    data = transData4GBT(businessDF, userDF, starDF)
    return gbt.fit(data)
def gradient_boosted_tree_regression(train_data, test_data):
    """Fit a GBT regressor for ``MEDV`` and print its RMSE on ``test_data``.

    Args:
        train_data: DataFrame with ``features`` and ``MEDV`` columns used
            for fitting.
        test_data: DataFrame the fitted model is evaluated on.
    """
    gbt = GBTRegressor(featuresCol='features', labelCol='MEDV', maxIter=10)
    gbt_model = gbt.fit(train_data)
    gbt_predictions = gbt_model.transform(test_data)
    # show() prints the rows itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    gbt_predictions.select('prediction', 'MEDV', 'features').show(5)
    gbt_evaluator = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = gbt_evaluator.evaluate(gbt_predictions)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
def testRegression(train, test):
    """Fit a GBT regressor on ``train`` and print RMSE, R2 and MAE on ``test``.

    Both frames are expected to carry an ``indexedLabel`` column, which is
    used as the label.
    """
    # Train a GradientBoostedTrees model.
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = rf.fit(train)
    # RegressionMetrics consumes an RDD of (prediction, label) pairs.
    # DataFrame.map is gone in Spark 2+, so go through .rdd explicitly.
    predictionAndLabels = model.transform(test) \
        .select("prediction", "indexedLabel") \
        .rdd \
        .map(lambda x: (x.prediction, x.indexedLabel))
    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
def testRegression(train, test):
    """Fit a GBT regressor on ``train`` and print RMSE, R2 and MAE on ``test``.

    Both frames are expected to carry an ``indexedLabel`` column, which is
    used as the label.
    """
    # Train a GradientBoostedTrees model.
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = rf.fit(train)
    # RegressionMetrics consumes an RDD of (prediction, label) pairs.
    # DataFrame.map is gone in Spark 2+, so go through .rdd explicitly.
    predictionAndLabels = model.transform(test) \
        .select("prediction", "indexedLabel") \
        .rdd \
        .map(lambda x: (x.prediction, x.indexedLabel))
    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
def params_of_GBTRegressor():
    """Pretty-print the default parameter map of a fresh ``GBTRegressor``.

    A Spark session named "tryout" is obtained (or created) first.

    Example tuning grid kept for reference::

        def param_grid_gbtr(esti: Estimator) -> list:
            return (ParamGridBuilder()
                    .addGrid(esti.maxBins, [16, 32, 64])
                    .addGrid(esti.maxDepth, [3, 5, 10])
                    .build())

    Sample output recorded from one run (parent
    ``GBTRegressor_a1b083f1027e``), with candidate grids noted alongside:

    * ``maxBins`` = 32 -- max number of bins for discretizing continuous
      features; must be >= 2 and >= number of categories for any
      categorical feature. Grid idea: ``(esti.maxBins, [01, 32])``
    * ``lossType`` = 'squared' -- loss function GBT minimizes
      (case-insensitive); supported: squared, absolute.
      Grid idea: ``(esti.lossType, ['absolute', 'squared'])``
    * ``featureSubsetStrategy`` = 'all' -- number of features considered
      for splits at each node; supported: 'auto', 'all', 'onethird',
      'sqrt', 'log2', or 'n' (a fraction in (0, 1.0] or a count in
      (1, number of features)); documented default 'auto'.
      Grid idea: ``(esti.featureSubsetStrategy, ['auto', 'all'])``
    * ``minInstancesPerNode`` = 1 -- minimum instances each child must
      have after a split, otherwise the split is discarded; >= 1.
    * ``subsamplingRate`` = 1.0 -- fraction of training data used for
      each tree, in (0, 1].
    * ``maxDepth`` = 5 -- maximum tree depth (>= 0); depth 0 is one leaf,
      depth 1 is one internal node plus two leaves.
      Grid idea: ``(esti.maxDepth, [3, 5, 8])``
    * ``checkpointInterval`` = 10 -- checkpoint interval (>= 1) or -1 to
      disable; ignored if no checkpoint directory is set in the
      SparkContext.
    * ``cacheNodeIds`` = False -- if true, cache node IDs per instance,
      which can speed up training of deeper trees.
    * ``stepSize`` = 0.1 -- learning rate in (0, 1] shrinking each
      estimator's contribution.
    * ``minInfoGain`` = 0.0 -- minimum information gain for a split to be
      considered.
    * ``maxMemoryInMB`` = 256 -- memory allocated to histogram
      aggregation.
    * ``maxIter`` = 20 -- max number of iterations (>= 0).
    """
    SparkSession.builder.appName("tryout").getOrCreate()
    default_params = GBTRegressor().extractParamMap()
    pprint(default_params)
def train_boosted_regression(self, depth=2, n_trees=50, learning_rate=.01, max_cats=6):
    '''
    train dataset on boosted decision trees
    --------
    Parameters
    depth: int - max allowable depth of each decision tree
    n_trees: int - max number of boosting iterations
    learning_rate: float - step size applied to each tree's contribution
    max_cats: int - maxCategories threshold passed to the VectorIndexer
    --------
    The fitted pipeline (indexer + regressor) is stored on self.model.
    '''
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=max_cats).fit(self.train)
    # The regressor must read the indexer's output column; pointing it at
    # the raw "features" column made the indexing stage a no-op.
    gbr = GBTRegressor(labelCol='label',
                       featuresCol="indexedFeatures",
                       maxDepth=depth,
                       maxIter=n_trees,
                       stepSize=learning_rate,
                       maxMemoryInMB=2000)
    pipeline = Pipeline(stages=[featureIndexer, gbr])
    # Train model. This also runs the indexer.
    self.model = pipeline.fit(self.train)
def score_gbt(split_input_train_df, split_input_validation_df, model_evaluator):
    """Cross-validate a GBT regressor and record its validation score.

    The score from ``model_evaluator`` is appended to the module-level
    ``model_rmse`` list, ``model_count`` is incremented, and the fitted
    cross-validator model is stored in ``model_dict`` under the new count.
    """
    global model_rmse, model_dict, model_count
    print(
        "###################### Gradient Boosted Tree Regression #########################"
    )
    gbt_regressor = GBTRegressor(featuresCol='features',
                                 labelCol='total_delivery_duration')

    print("CrossValidation...")
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(gbt_regressor.maxIter, [5, 10])
    grid_builder = grid_builder.addGrid(gbt_regressor.maxBins, [5700, 6000])
    grid_builder = grid_builder.addGrid(gbt_regressor.maxMemoryInMB, [256, 512])
    grid_builder = grid_builder.addGrid(gbt_regressor.subsamplingRate, [0.1, 1.0])
    gbt_paramGrid = grid_builder.build()
    gbt_cross_val = CrossValidator(estimator=gbt_regressor,
                                   estimatorParamMaps=gbt_paramGrid,
                                   evaluator=model_evaluator,
                                   numFolds=3)
    print("Done")

    print("Fitting training data...")
    gbt_cv_model = gbt_cross_val.fit(split_input_train_df)
    print("Done")

    print("Evaluating on validation data...")
    validation_predictions = gbt_cv_model.transform(split_input_validation_df)
    rmse = model_evaluator.evaluate(validation_predictions)
    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {"GBT": gbt_cv_model}
    print("RMSE on validation data: %f" % rmse)
def get_best_weather_model(data):
    """Fit every candidate trainer and return the best-scoring fitted model.

    Candidates come from ``make_weather_trainers``; each is fitted on a 75%
    split and scored on the remaining 25% with the first trainer's evaluator.
    """
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    estimator_gridbuilders = [
        estimator_gridbuilder(RandomForestRegressor(),
                              dict(maxDepth=[5], maxBins=[5], numTrees=[20])),
        estimator_gridbuilder(GBTRegressor(maxIter=100), dict()),
    ]
    metricName = 'r2'
    tvs_list = make_weather_trainers(
        .2,  # fraction of data for training
        estimator_gridbuilders,
        metricName)

    ev = tvs_list[0].getEvaluator()
    # Flip the sign for lower-is-better metrics so max() always picks the best.
    scorescale = 1 if ev.isLargerBetter() else -1

    def fit_and_score(tvs):
        fitted = tvs.fit(train)
        scaled = ev.evaluate(fitted.transform(test)) * scorescale
        return (fitted, get_estimator_name(tvs.getEstimator()), scaled)

    # print(list(tvs_list).count())
    model_name_scores = [fit_and_score(tvs) for tvs in tvs_list]

    best_model, best_name, best_score = max(model_name_scores,
                                            key=lambda triplet: triplet[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
def train_model(model_file, inputs):
    """Train a tmax GBT pipeline on CSV data, save it, and print r2/rmse.

    ``inputs`` is read with ``tmax_schema``; the fitted pipeline is written
    to ``model_file`` (overwriting) and scored on a 25% validation split.
    """
    # get the data
    train_tmax = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = train_tmax.randomSplit([0.75, 0.25], seed=110)

    # query = "SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax FROM __THIS__"
    # Self-join each observation with the same station's previous day.
    query = """SELECT today.station, dayofyear(today.date) as doy, today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""

    # weather_assembler = VectorAssembler(inputCols=['latitude','longitude','elevation', 'doy'], outputCol="features")
    feature_names = ['latitude', 'longitude', 'elevation', 'doy', 'yesterday_tmax']
    weather_assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    regressor = GBTRegressor(maxIter=50, maxDepth=5, featuresCol="features", labelCol="tmax")
    transquery = SQLTransformer(statement=query)

    model = Pipeline(stages=[transquery, weather_assembler, regressor]).fit(train)
    model.write().overwrite().save(model_file)

    # use the model to make predictions
    predictions = model.transform(validation)
    # predictions.show()

    # evaluate the predictions
    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('r2 =', r2)
    print('rmse =', rmse)
def train_gbt(train_data, test_data, label, file_to_save):
    """Cross-validate a GBT regressor and record the winning parameters.

    Args:
        train_data: DataFrame with a ``features`` column and the label column.
        test_data: DataFrame forwarded to ``save_test_info``.
        label: Name of the label column; also used in output file names.
        file_to_save: Forwarded to ``save_test_info``.

    Returns:
        The fitted ``CrossValidatorModel``.
    """
    duration_gbt = GBTRegressor(labelCol=label, featuresCol="features")
    paramGrid = (ParamGridBuilder()
                 .addGrid(duration_gbt.maxDepth, [2, 4, 6])
                 .addGrid(duration_gbt.maxBins, [300])
                 .build())
    crossval = CrossValidator(estimator=duration_gbt,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(labelCol=label),
                              numFolds=5)
    cvModel = crossval.fit(train_data)

    bestModel = cvModel.bestModel
    maxDepth = bestModel._java_obj.getMaxDepth()
    maxBins = bestModel._java_obj.getMaxBins()

    gbt_params_folder_path = params_folder_path + 'GBT/'
    if not os.path.exists(gbt_params_folder_path):
        os.makedirs(gbt_params_folder_path)
    # Text mode: the payload is str, which a binary handle rejects on
    # Python 3.
    with open(gbt_params_folder_path + label + 'Params.csv', 'w') as params_file:
        params_file.write("param, value" + '\n')
        params_file.write("maxDepth, " + str(maxDepth) + '\n')
        params_file.write("maxBins, " + str(maxBins) + '\n')

    save_test_info(bestModel, test_data, label + "-gbt", file_to_save)
    return cvModel
def main(inputs, model_file):
    """Train a tmax GBT pipeline on CSV ``inputs``, save it, print r2 and RMSE."""
    data = spark.read.csv(inputs, schema=schema())
    train, validation = data.randomSplit([0.75, 0.25], seed=42)

    feature_names = [
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ]
    stages = [
        SQLTransformer(statement=yes_tmax()),
        SQLTransformer(statement=ret_query()),
        VectorAssembler(inputCols=feature_names, outputCol='features'),
        GBTRegressor(featuresCol='features', labelCol='tmax'),
    ]
    model = Pipeline(stages=stages).fit(train)
    predictions = model.transform(validation)
    model.write().overwrite().save(model_file)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print("R-squared value : " + str(r2))
    print("RMSE value : " + str(rmse))
def main(inputs, model_file):
    """Fit a GBT regressor for ``message_code_id`` on Cassandra sensor data.

    Reads ``sensor_data_table`` from ``keyspace``, fits on a 75% split and
    prints r2 and rmse for the remaining 25%.

    Args:
        inputs: Unused by this function.
        model_file: Unused by this function; the model is not saved.
    """
    sensor_data_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=sensor_data_table, keyspace=keyspace).load()

    # creating a ML pipeline
    sensor_data_df = sensor_data_df.select(
        sensor_data_df['datetime'],
        sensor_data_df['latitude'],
        sensor_data_df['longitude'],
        sensor_data_df['message_code_id'],
        sensor_data_df['sensor_reading'],
        sensor_data_df['sensor_name'],
    ).orderBy(sensor_data_df['datetime'].asc())

    train_set, validation_set = sensor_data_df.randomSplit([0.75, 0.25])
    # DataFrames expose cache(); the previous "catch" raised AttributeError.
    train_set.cache()
    validation_set.cache()

    # The space before FROM matters: the two literals are concatenated.
    sql_transformer_statement = (
        "SELECT latitude, longitude, sensor_name, sensor_reading, message_code_id "
        "FROM __THIS__")
    sql_transformer = SQLTransformer(statement=sql_transformer_statement)
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'sensor_name', 'sensor_reading'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='message_code_id')
    pipeline = Pipeline(stages=[sql_transformer, assemble_features, classifier])

    model = pipeline.fit(train_set)
    predictions = model.transform(validation_set)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='message_code_id',
                                       metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='message_code_id',
                                         metricName='rmse')
    r2_score = r2_evaluator.evaluate(predictions)
    rmse_score = rmse_evaluator.evaluate(predictions)
    print('r2 validation score : ', r2_score)
    print('rmse validation score: ', rmse_score)
def train_model():
    """Download the red-wine quality data, fit a GBT pipeline and save it.

    The pipeline assembles the eleven physico-chemical columns, indexes
    ``quality`` into ``label``, fits a ``GBTRegressor`` and maps the
    prediction back through ``IndexToString``. The fitted model is saved
    under ``LOCAL_MODEL_PATH``/``MODEL_NAME`` and then passed to
    ``calculate_metric_value``.
    """
    print_title("download winequality-red data!")
    download_dataset()
    print_title("load data!")

    # load winequality-red.csv into a Spark dataframe
    # NOTE(review): the message below still mentions Iris and no rows are
    # actually shown.
    print("First 10 rows of Iris dataset:")
    lines = pd.read_csv("/tmp/winequality-red.csv")
    lines_df = spark.createDataFrame(lines)

    # convert features
    feature_columns = [
        "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
        "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
        "pH", "sulphates",
        "alcohol",  # was misspelled "alcoho", a column the dataset lacks
    ]
    assembler = pyspark.ml.feature.VectorAssembler(inputCols=feature_columns,
                                                   outputCol='features')

    # convert text labels into indices
    label_indexer = pyspark.ml.feature.StringIndexer(
        inputCol='quality', outputCol='label').fit(lines_df)

    # train
    gbt = GBTRegressor(featuresCol="features", maxIter=10)
    label_converter = pyspark.ml.feature.IndexToString(
        inputCol='prediction',
        outputCol='predictionClass',
        labels=label_indexer.labels)
    pipeline = Pipeline(stages=[assembler, label_indexer, gbt, label_converter])

    # fit the pipeline to training documents.
    model_local_path = os.path.join(LOCAL_MODEL_PATH, MODEL_NAME)
    model = pipeline.fit(lines_df)

    # save model
    model.save(model_local_path)
    calculate_metric_value(model, lines_df)
def main(inputs, output):
    """Train a tmax GBT pipeline on CSV ``inputs``, print RMSE, save to ``output``."""
    field_types = [
        ('station', types.StringType()),
        ('date', types.DateType()),
        ('latitude', types.FloatType()),
        ('longitude', types.FloatType()),
        ('elevation', types.FloatType()),
        ('tmax', types.FloatType()),
    ]
    tmax_schema = types.StructType(
        [types.StructField(name, dtype) for name, dtype in field_types])

    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Adds the day-of-year column.
    sqlTrans = SQLTransformer(statement='select *,dayofyear(date) as day FROM __THIS__')
    # Joins each row with the same station's previous-day tmax.
    sqlTrans1 = SQLTransformer(statement=(
        'SELECT today.station,today.date,today.latitude,today.longitude,'
        'today.elevation,today.tmax, yesterday.tmax AS yesterday_tmax '
        'FROM __THIS__ as today '
        'INNER JOIN __THIS__ as yesterday '
        'ON date_sub(today.date, 1) = yesterday.date '
        'AND today.station = yesterday.station'))
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'day', 'yesterday_tmax'],
        outputCol='features')
    gbt = GBTRegressor(featuresCol='features', labelCol='tmax')

    weather_model = Pipeline(
        stages=[sqlTrans1, sqlTrans, assemble_features, gbt]).fit(train)
    predictions = weather_model.transform(validation)
    # predictions.show()

    evaluator = RegressionEvaluator(labelCol='tmax',
                                    predictionCol='prediction',
                                    metricName='rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % score)
    weather_model.write().overwrite().save(output)
def pick_algorithm(self, algorithm, features_col, label_col):
    """Return an unfitted estimator for the named algorithm.

    ``'linearregression'`` yields a ``LinearRegression`` and
    ``'gbtregressor'`` a ``GBTRegressor``, both wired to the given feature
    and label columns. Any other name yields ``None``.
    """
    if algorithm == 'gbtregressor':
        return GBTRegressor(featuresCol=features_col, labelCol=label_col)
    if algorithm == 'linearregression':
        return LinearRegression(featuresCol=features_col, labelCol=label_col)
    return None
def preprocessing(self):
    """Build the unfitted bicycle-rental pipeline.

    Stages, in order: impute humidity/pressure, string-index the
    categorical columns, assemble the feature vector, and a GBT regressor
    on ``bicycle_rentals``.
    """
    categorical = [
        "part_time", "holiday", "week_days", "weather_description_mf", "month"
    ]
    imputer = Imputer(inputCols=["humidity", "pressure"],
                      outputCols=["humidity_input", "pressure_input"])
    indexers = [
        StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
        for name in categorical
    ]
    feature_columns = [
        "part_time_indexed", "holiday_indexed", "month_indexed",
        "week_days_indexed", "weather_description_mf_indexed",
        "humidity_input", "pressure_input", "temperature", "wind_speed",
        "from_station_id", "mean_dpcapacity_start", "mean_dpcapacity_end",
        "sum_subscriber", "sum_customer"
    ]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    regressor = GBTRegressor(labelCol="bicycle_rentals")
    return Pipeline(stages=[imputer] + indexers + [assembler, regressor])
def main():
    """Fit a GBT regressor to predict box volume from random dimensions.

    Generates 100000 rows of random length/width/height, fits on a 75%
    split and prints the r2 score on the remaining 25%.
    """
    dimensions = spark.range(100000).select(
        (functions.rand() * 100).alias('length'),
        (functions.rand() * 100).alias('width'),
        (functions.rand() * 100).alias('height'),
    )
    volume = dimensions['length'] * dimensions['width'] * dimensions['height']
    data = dimensions.withColumn('volume', volume)
    training, validation = data.randomSplit([0.75, 0.25], seed=42)

    stages = [
        VectorAssembler(inputCols=['length', 'width', 'height'],
                        outputCol='features'),
        GBTRegressor(featuresCol='features', labelCol='volume'),
    ]
    model = Pipeline(stages=stages).fit(training)

    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='volume',
                                       metricName='r2')
    print(r2_evaluator.evaluate(predictions))
def Distr_GBTRegressor(xy_train, xy_test):
    """Tune a GBT regressor in two cross-validated passes; return the best model.

    Pass one picks ``subsamplingRate`` at 100 iterations; pass two fixes
    that rate and compares 110 vs 120 iterations. ``xy_test`` is not used.
    """
    gr = GBTRegressor(minInstancesPerNode=20, maxDepth=25)
    evalu = RegressionEvaluator()

    def cross_validate(iterations, rates):
        grid = ParamGridBuilder() \
            .addGrid(gr.maxIter, iterations) \
            .addGrid(gr.subsamplingRate, rates) \
            .build()
        validator = CrossValidator(estimator=gr,
                                   estimatorParamMaps=grid,
                                   evaluator=evalu,
                                   numFolds=5)
        # Search for the best parameter combination; the returned model
        # holds the best estimator found.
        return validator.fit(xy_train)

    cvModel_1 = cross_validate([100], [0.5, 0.8, 1.0])
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1, 'reg')['subsamplingRate']

    cvModel = cross_validate([110, 120], [best_params_1])
    best_params = Get_best_params(cvModel, 'reg')
    print("Best parameters set found: %s" % best_params)
    return cvModel.bestModel
def train(df: DataFrame):
    """Fit a GBT regressor on rows of ``df`` that have a ``sales`` value.

    Each row becomes ``Row(label=sales, features=[...])`` where the
    features are all remaining fields except ``Sales_Pred`` and ``sales``.
    """
    def astraining(row: Row) -> Row:
        fields = row.asDict()
        del fields['Sales_Pred']
        sales = fields.pop('sales')
        # NOTE(review): features is a plain list here; GBTRegressor.fit
        # presumably needs an ML vector column -- confirm and convert.
        return Row(label=sales, features=list(fields.values()))

    # Use the DataFrame argument: the bare name ``train`` is this function
    # itself and has no ``rdd`` attribute.
    labelled = df.rdd \
        .filter(lambda r: r["sales"] is not None) \
        .map(astraining)
    gbt = GBTRegressor(maxIter=10)
    training_df = spark.createDataFrame(labelled)
    training_df.show()
    gbt.fit(training_df)
    print("----------- after fit ------------")
def model_define(self):
    """Create the Gradient Boosting Tree regression estimator.

    NOTE(review): the estimator is built with library defaults; no
    hyperparameters from :func:`get_parameters` are applied here -- confirm
    whether they are set elsewhere.

    Returns:
        (pyspark.ml.regression.GBTRegressor) Gradient Boosting Tree Regression model
    """
    regressor = GBTRegressor()
    return regressor
def _get_xgboost_regressor_model(col, train):
    '''
    Fit a Gradient Boosted Tree regressor with ``col`` as the label column
    on ``train`` and return the fitted model (used for predicting missing
    values).
    '''
    print(
        'Using Gradient Boosted Regressor Module to predict Missing Values ...'
    )
    estimator = GBTRegressor(labelCol=col)
    #params = ParamGridBuilder().addGrid(estimator.maxDepth, [5, 10, 20]).\
    #    addGrid(estimator.minInfoGain, [0.0, 0.01, 1.0]).\
    #    addGrid(estimator.maxBins, [32, 20, 50, 100, 300]).build()
    #cv = CrossValidator(estimator=estimator,
    #                    estimatorParamMaps=params,
    #                    evaluator=RegressionEvaluator(labelCol=col),
    #                    numFolds=10)
    return estimator.fit(train)
def estimators(config):
    """Build the unfitted estimator described by ``config``.

    Args:
        config: Nested mapping. ``config['base']`` supplies ``model``
            (``'rf'``, ``'gbm'``, ``'logistic'`` or ``'linear'``),
            ``model_type`` (``'classification'`` or ``'regression'``; only
            consulted for ``'rf'`` and ``'gbm'``), ``featuresCol``,
            ``labelCol`` and ``predictionCol``. ``config['model']``
            supplies the model-specific hyperparameters.

    Returns:
        The configured estimator instance.

    Raises:
        ValueError: If the model / model_type combination is unsupported
            (previously this surfaced as an UnboundLocalError).
    """
    # All models to choose amongst for simple regression/classification
    base = config['base']
    model_type = base['model_type']
    model = base['model']

    if model in ('rf', 'gbm'):
        if model_type not in ('classification', 'regression'):
            raise ValueError(
                "unsupported model_type %r for model %r" % (model_type, model))
        is_classifier = model_type == 'classification'
        if model == 'rf':
            estimator_cls = RandomForestClassifier if is_classifier else RandomForestRegressor
            tuned = ('numTrees', 'maxDepth')
        else:
            estimator_cls = GBTClassifier if is_classifier else GBTRegressor
            tuned = ('lossType', 'maxDepth', 'stepSize')
    elif model == 'logistic':
        estimator_cls = LogisticRegression
        tuned = ('threshold', 'regParam', 'elasticNetParam')
    elif model == 'linear':
        estimator_cls = LinearRegression
        tuned = ('regParam', 'elasticNetParam')
    else:
        raise ValueError("unsupported model %r" % (model,))

    settings = config['model']
    hyperparameters = {name: settings[name] for name in tuned}
    return estimator_cls(featuresCol=base['featuresCol'],
                         labelCol=base['labelCol'],
                         predictionCol=base['predictionCol'],
                         **hyperparameters)
# COMMAND ----------

# Fit the decision tree regressor and report in-sample r2 and rmse.
dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(labelCol='petalWidth')
print(gbt.explainParams())

# COMMAND ----------

# Same in-sample evaluation for the gradient boosted tree.
gbtModel = gbt.fit(irisPetal)
gbtPredictions = gbtModel.transform(irisPetal)
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC We should really test our gradient boosted tree out-of-sample as it is easy to overfit with a GBT model.

# COMMAND ----------
# Select example rows to display.
predictions.select("prediction", "label").show(30, False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print('RMSE: ' + str(RMSE))

#######################################################################################
#
#   Modeling - Gradient Boosting (Regression)
#
#######################################################################################

gbt = GBTRegressor(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxDepth=5,
    maxBins=32,
    maxIter=20,
    seed=12345,
)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbtmodel = gbt.fit(training)

# Make predictions.
predictions = gbtmodel.transform(testing)

# Select example rows to display.
predictions.select("prediction", "label").show(30, False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print('RMSE: ' + str(RMSE))
stages = []  # stages in our Pipeline

# Categorical string columns: index them, then one-hot encode the index.
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    stages += [stringIndexer, encoder]

# Columns that are one-hot encoded directly, without a StringIndexer.
#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type']
for eCol in encColumns:
    encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol + "classVec")
    stages += [encoder]

#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount", "tip_amount"]
# Build a real list: on Python 3, map() returns an iterator, so the old
# "map(...) + map(...)" concatenation raised TypeError.
assemblerInputs = (
    [c + "classVec" for c in categoricalColumns]
    + [c + "classVec" for c in encColumns]
    + numericCols
)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)

from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(featuresCol='features', labelCol='total_amount', maxIter=10)
gbt_model = gbt.fit(dataset)
gbt_model.write().overwrite().save("./nyc-01020304-18-6vm-gbt-model")
# COMMAND ---------- from pyspark.ml.regression import DecisionTreeRegressor dtr = DecisionTreeRegressor() print dtr.explainParams() dtrModel = dtr.fit(df) # COMMAND ---------- from pyspark.ml.regression import RandomForestRegressor from pyspark.ml.regression import GBTRegressor rf = RandomForestRegressor() print rf.explainParams() rfModel = rf.fit(df) gbt = GBTRegressor() print gbt.explainParams() gbtModel = gbt.fit(df) # COMMAND ---------- from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.regression import GeneralizedLinearRegression from pyspark.ml import Pipeline from pyspark.ml.tuning import CrossValidator, ParamGridBuilder glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity") pipeline = Pipeline().setStages([glr]) params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build() evaluator = RegressionEvaluator()\ .setMetricName("rmse")\
print(datatype)

var_target = 'rating'
# Every column that is not an identifier, the label or free text is a feature.
var_features = [col for col in enriched1.columns
                if col not in ['userId', 'movieId', 'rating', 'timestamp', 'title', 'tag']]

# Generate Features Vector and Label
va = VectorAssembler(inputCols=var_features, outputCol="features")
modelprep1 = va.transform(enriched1).select('userId', 'movieId', 'rating', 'features')

# Only 10% of the data is used (7% train, 3% test); the rest is set aside.
training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])

print('[ INFO ] Training: ' + str(training.count()) + ' records')
# Report the size of the testing split (previously printed the training count).
print('[ INFO ] Testing: ' + str(testing.count()) + ' records')

gb = GBTRegressor(featuresCol="features", labelCol=var_target, predictionCol="prediction",
                  maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')

predictions = gbmodel.transform(testing)

print('[ INFO ] Printing predictions vs label...')
# select() must come before show(): show() prints and returns None, so
# chaining .select() after it raised AttributeError.
predictions.select('prediction', var_target).show(10, False)

evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print('[ INFO ] Model Fit (RMSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})))
#print '[ INFO ] Model Fit (MSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"}))
#print '[ INFO ] Model Fit (R2): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"}))

total_runtime_seconds = (datetime.datetime.now() - start_time).seconds