def Prediction(self, modelType):
    """Score ``self.Features`` with a saved MLlib model and store the output.

    The model is loaded from ``<self.baseDir>/fraudModel/Model/<modelType>``,
    the features are parallelized on ``self.sc`` and scored, and the collected
    predictions are appended to ``self.df_PD`` as a trailing ``'result'``
    column.

    Args:
        modelType: One of ``'RF'``, ``'GBDT'``, ``'LRsgd'``, ``'LRlbfgs'`` or
            ``'SVM'``. Any other value leaves ``self.df_PD`` untouched.

    Returns:
        None. ``self.df_PD`` is modified in place.
    """
    data_point = self.Features

    # Every supported type runs the same load -> predict -> insert pipeline;
    # only the model class differs. The class is resolved branch by branch so
    # an unsupported type never touches any loader.
    if modelType == 'RF':
        model_class = RandomForestModel
    elif modelType == 'GBDT':
        model_class = GradientBoostedTreesModel
    elif modelType in ('LRsgd', 'LRlbfgs'):
        model_class = LogisticRegressionModel
    elif modelType == 'SVM':
        model_class = SVMModel
    else:
        # Unsupported model types have always been a silent no-op.
        return

    model = model_class.load(
        self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
    result = np.array(
        model.predict(self.sc.parallelize(data_point)).collect())
    # Insert at position len(columns), i.e. append as the last column.
    self.df_PD.insert(len(self.df_PD.columns), 'result', result)
def getPredictions(pfDF, decodePlayerIds, rddDir, sc, pitching_hitter, predictors):
    """Score ``pfDF`` once per predictor and combine the results per player.

    For every name in ``predictors`` a GradientBoostedTreesModel is loaded
    from ``rddDir + pitching_hitter + "_" + predictor + "_model.RandomForest"``,
    applied to the features produced by ``toLabeledPoint(pfDF, predictor)``,
    and zipped with the decoded player ids into a two-column DataFrame
    (``player_id``, ``<predictor>``). The per-predictor frames are inner-joined
    on ``player_id``; ``sumFD`` is then mapped over the joined frame and the
    result is returned.

    NOTE(review): ``sumFD`` is not defined in this function (only a
    commented-out copy is kept below); it is presumably defined elsewhere in
    the module -- confirm.
    NOTE(review): when ``predictors`` is empty, ``all_fd_points_df`` is still
    ``None`` at the final ``.map`` call.
    """
    all_fd_points_df = None
    #playerIds = pfDF.map(lambda x: str(x.player_id) + '_' + x.game_id)
    # Translate the encoded player ids back through the decodePlayerIds lookup.
    playerIds = pfDF.map(lambda x: x.player_id).map(lambda x: decodePlayerIds[x])
    print "playerIds=", playerIds.collect()
    for predictor in predictors:
        print "predictor=", predictor
        #modelFilename=rddDir + "pitching_" + predictor + "_model.RandomForest"
        # The file suffix says RandomForest, but the model is loaded as a GBT model.
        modelFilename = rddDir + pitching_hitter + "_" + predictor + "_model.RandomForest"
        model = GradientBoostedTreesModel.load(sc, modelFilename)
        data = toLabeledPoint(pfDF, predictor)
        #pitcherFeatures = pfDF.collect()
        predictions = model.predict(data.map(lambda x: x.features))
        print "p predictions=", predictions
        print "p predictions take=", predictions.take(16)
        # Only used for the debug print below; cached and never unpersisted here.
        labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache()
        print "predictions=", labelsAndPredictions.take(16)
        #pitcherPredictions = pfDF.map(lambda x: x.asDict()['player_id']).map(lambda x: decodePlayerIds[x]).zip(predictions).cache()
        #print "pitcherPredictions=", pitcherPredictions.take(16)
        if all_fd_points_df is None:
            # First predictor: this frame seeds the accumulated result.
            #all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
            print "FIRST: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor])
            # printSchema() prints by itself and returns None, so "None" follows the label.
            print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            print "# all_fd_points_df", all_fd_points_df.count()
            print "first all_fd_points_df", all_fd_points_df.take(5)
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        else:
            # Later predictors: add one more column by joining on player_id.
            print "ELSE: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            curr_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor])
            print "all_fd_points_df", all_fd_points_df.printSchema()
            print "PRE all_fd_points_df", all_fd_points_df.take(16)
            print "curr_fd_points_df", curr_fd_points_df.printSchema()
            print "# curr_fd_points_df", curr_fd_points_df.count()
            #print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
            print "first curr", curr_fd_points_df.take(16)
            # Drop the right-hand player_id so the joined frame keeps a single id column.
            all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
            print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            #print "all debugstring", all_fd_points_df.rdd.toDebugString()
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    # Commented-out copy of sumFD, kept for reference:
    # def sumFD(r):
    #     x = r.asDict()
    #     fd_sum = 0.0
    #     for k,v in x.iteritems():
    #         if k not in ['fd_points', 'player_id']:
    #             fd_sum += v
    #     x['fd_sum'] = fd_sum
    #     x['fd_points_orig'] = x['fd_points']
    #     x['fd_points'] = fd_sum
    #     print "sumx=", x
    #     return Row(**x)
    predictions = all_fd_points_df.map(sumFD)
    print pitching_hitter + " predictions=", predictions.take(50)
    return predictions
def getFDPointsPredictions(pfDF, decodePlayerIds, rddDir, sc, pitching_hitter, predictors): all_fd_points_df = None playerIds = pfDF.map(lambda x: x.player_id).map(lambda x: decodePlayerIds[x]) print "playerIds=", playerIds.collect() predictor = 'fd_points' print "predictor=", predictor modelFilename = rddDir + pitching_hitter + "_" + predictor + "_model.RandomForest" model = GradientBoostedTreesModel.load(sc, modelFilename) data = toLabeledPoint(pfDF, predictor) predictions = model.predict(data.map(lambda x: x.features)) print "p predictions=", predictions print "p predictions take=", predictions.take(16) labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache() print "labelsAndPredictions=", labelsAndPredictions.take(16) print " # playerIds=", playerIds.count() all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]) print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema() print "# all_fd_points_df", all_fd_points_df.count() print "first all_fd_points_df", all_fd_points_df.take(5) print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count() print pitching_hitter + " predictions=", all_fd_points_df.take(50) return all_fd_points_df
import flask
from flask import Flask, request, url_for, Response
from sklearn.externals import joblib
from pyspark.mllib.tree import GradientBoostedTreesModel
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext,SparkConf
import json

app = Flask(__name__)

# Load the model once at import time so every request reuses it.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf = conf)
model =GradientBoostedTreesModel.load(sc,'./sellModel')


@app.route("/", methods=["GET"])
def index():
    """Return a JSON document listing the "gbdt" endpoint URL and its parameters."""
    with app.test_request_context():
        # Build, for each endpoint, the URL it listens on and that URL's parameters.
        result = {"gbdt": {"url": url_for("gbdt"), "params": ["vector"]}}
    result_body = flask.json.dumps(result)
    return Response(result_body, mimetype="application/json")


@app.route("/ml/gbdt", methods=["GET"])
def gbdt():
    request_args = request.args
    # If no parameters were passed in, return a hint message.
# %% svm_predictions = test.map( lambda line: (line[0], line[1], float(svm_model.predict(line[3])))) svm_predictions.coalesce(1).toDF().write.options(header="true").csv( "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv") # %% [markdown] # 日期:2020-12-20 14:18:59 排名: 无 # score:0.5156678 # %% [markdown] # ## Gradient Boosted Trees # %% from pyspark.mllib.tree import GradientBoostedTreesModel GBDT_model = GradientBoostedTreesModel.load( sc, "hdfs://node1:9000/user/root/exp4/models/myGradientBoostingClassificationModel" ) # %% predictions = GBDT_model.predict(test.map(lambda x: x[3])) GBDT_predictions = test.map(lambda lp: (lp[0], lp[1])).zip(predictions).map( lambda lp: (lp[0][0], lp[0][1], lp[1])) GBDT_predictions.coalesce(1).toDF().write.options(header="true").csv( "hdfs://node1:9000/user/root/exp4/predictions/GBDT_predictions.csv") # GBDT_predictions = test.map(lambda line: (line[0],line[1],float(GBDT_model.predict(line[3])))) # GBDT_predictions.coalesce(1).toDF().write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/GBDT_predictions.csv") # %% [markdown] # 日期:2020-12-20 14:51:00 排名: 无 # score:0.5000562
def test_classification(self):
    """Train each MLlib classifier on a tiny separable set and verify it.

    Every model must predict ``<= 0`` for rows 0 and 2 and ``> 0`` for rows
    1 and 3. The tree-based models are additionally saved to a scratch
    directory and reloaded, and the reloaded model's debug string must match
    the original's. The scratch directory is removed even when an assertion
    fails.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import (
        DecisionTree, DecisionTreeModel, RandomForest,
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel)

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # Rows 0 and 2 carry label 0.0; rows 1 and 3 carry label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_save_load(model, model_class, model_dir):
        # A round trip through save/load must reproduce the same trees.
        model.save(self.sc, model_dir)
        reloaded = model_class.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_save_load(gbt_model, GradientBoostedTreesModel, os.path.join(temp_dir, "gbt"))
    finally:
        # Previously the directory leaked whenever an assertion above failed.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
# PACK THE INDEXED CATEGORICAL FIELDS AND NUMERIC FIELDS OF A ROW FOR TREE-BASED MODELS
def parseRowIndexingRegression(line):
    """Return the ten model input fields of ``line`` as one NumPy array."""
    return np.array([
        line.paymentIndex,
        line.vendorIndex,
        line.rateIndex,
        line.TrafficTimeBinsIndex,
        line.pickup_hour,
        line.weekday,
        line.passenger_count,
        line.trip_time_in_secs,
        line.trip_distance,
        line.fare_amount,
    ])


# FEATURE VECTORS FOR THE REGRESSION MODEL
indexedTESTreg = encodedFinal.map(parseRowIndexingRegression)

# KEEP THE RDD IN MEMORY WHILE IT IS BEING SCORED
indexedTESTreg.cache()

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel

####################################################################
## REGRESSION: LOAD THE SAVED MODEL, SCORE, AND WRITE THE RESULTS OUT
savedModel = GradientBoostedTreesModel.load(sc, BoostedTreeRegressionFileLoc)
predictions = savedModel.predict(indexedTESTreg)

# WRITE RESULTS UNDER A TIMESTAMPED NAME (spaces removed, ':' replaced by '_')
datestamp = unicode(datetime.datetime.now()).replace(' ','').replace(':','_')
btregressionfilename = "GradientBoostingTreeRegression_" + datestamp + ".txt"
dirfilename = scoredResultDir + btregressionfilename
predictions.saveAsTextFile(dirfilename)

# ## Cleanup objects from memory, print final time, and print scored output file locations
# #### Unpersist objects cached in memory
taxi_df_test_cleaned.unpersist()
indexedTESTreg.unpersist()
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index into the (label, prediction) pair: tuple-parameter lambdas such as
    # ``lambda (v, p): ...`` are a SyntaxError on Python 3.
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
    sameModel = GradientBoostedTreesModel.load(sc,
                                               "target/tmp/myGradientBoostingClassificationModel")
    # $example off$
# Load the encoded pitching features and expose them to SQL as the "pfe" table.
encodedPitchingFeatures = sqlContext.parquetFile(rddDir + "/pitching_features.enc.parquet")
encodedPitchingFeatures.registerTempTable("pfe")
print "pfe=", encodedPitchingFeatures.take(22)
# Keep only feature rows that match an fd_pitchers row on player and game date.
pfDF = sqlContext.sql("""select distinct pfe.* from fd_pitchers, pfe where fd_pitchers.player_id = pfe.player_id and fd_pitchers.game_date = pfe.game_date""")
#TODO - why are we getting duplicate records?
print "count pfdf=", pfDF.count()
print "pfDF=", pfDF.collect()
print "pfDF vals=", pfDF.select('player_id', 'game_id').collect()

#model = RandomForestModel.load(sc, rddDir + "batting_model.RandomForest")
#model = GradientBoostedTreesModel.load(sc, rddDir + "batting_model.RandomForest")
# NOTE(review): `model` is not read by any later statement in this fragment
# (only by the commented-out code below) -- confirm whether this load is still needed.
model = GradientBoostedTreesModel.load(sc, rddDir + "batting_fd_points_model.RandomForest")
playerAndPredictions = TrainModel.predictHitters(hfDF, decodedHitterPlayerIds, rddDir, sc)
hitterFeatures = hfDF.collect()
# Earlier inline scoring, kept for reference:
# global predictField
# predictField = 'fd_points'
# data = hfDF.map(toLabeledPoint)
# predictions = model.predict(data.map(lambda x: x.features))
# print "predictions=", predictions
# print "predictions take=", predictions.take(2)
# labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions).cache()
# print "predictions=", labelsAndPredictions.take(2)
# playerAndPredictions = hfDF.map(lambda x: x.asDict()['player_id']).map(lambda x: decodePlayerIds[x]).zip(predictions).cache()
# print "playerAndPredictions=", playerAndPredictions.take(2)

#model = RandomForestModel.load(sc, rddDir + "pitching_model.RandomForest")
#model = GradientBoostedTreesModel.load(sc, rddDir + "pitching_model.RandomForest")
conf = SparkConf().setAppName(
    'Gradient Boosted Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# split the data into training and test
trainingData, test = data.randomSplit([0.7, 0.3])

# train a gradient boost tree model
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo={},
                                             numIterations=3)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index into the (label, prediction) pair: tuple-parameter lambdas such as
# ``lambda (v, p): ...`` are a SyntaxError on Python 3.
testErr = labelAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(
    test.count())
print('test error :' + str(testErr))
print('learned classification GBT model :')
# toDebugString is a method; without the call the bound-method repr is
# printed instead of the tree ensemble.
print(model.toDebugString())

# save and load
model.save(sc, '../model/myGradientBoostingClassificationModel')
sameModel = GradientBoostedTreesModel.load(
    sc, '../model/myGradientBoostingClassificationModel')
sc.stop()
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTreesModel
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf
from pyspark.ml.util import MLReader
from random import random

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Load the saved GBT model from the working directory.
model = GradientBoostedTreesModel.load(sc, './gbdymodelonlionev1')

# Three 16-value rows: 0..15, the same values times 2000, and random noise.
ascending_row = list(range(16))
scaled_row = [2000 * k for k in range(16)]
random_row = [random() for _ in range(16)]
rdd = sc.parallelize([ascending_row, scaled_row, random_row])

print(model.predict(rdd).collect())
print(model)
ascontext.setSparkOutputSchema(output_schema) sys.exit(0) modelpath = ascontext.getModelContentToPath("model") model_metadata = json.loads(ascontext.getModelContentToString("model.metadata")) model_type=model_metadata["model_type"] # create a DataModelTools to handle data model and data conversions datamodel = model_metadata["datamodel"] dmt = DataModelTools(datamodel) predictors = model_metadata["predictors"] DataModelTools.checkPredictors(datamodel,predictors,df) from pyspark.mllib.tree import GradientBoostedTreesModel model = GradientBoostedTreesModel.load(sc, modelpath) # to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this dv = dmt.extractDenseVector(df,predictors).map(lambda x:x[1]) # scoring generates an RDD of predictions (but not the original features) predictions = model.predict(dv) # now we need to zip together the original rows from the DataFrame and the RDD of predictions # we end up with an RDD containing the list of values from the original dataframe plus the predicted class, converted from the encoded number to the original string def rowToList(row): result = [] for idx in range(0, len(row)): result.append(row[idx]) return result
#df = spark.read.option("header","false").csv("hdfs://student61:9000/wiki/final_Item.csv") df = spark.read.option("header", "false").option( "inferSchema", "true").csv("hdfs://student61:9000/wiki/final_Item.csv") #df.cast(DoubleType) print('parker!!!', df) print(df.first()) df.show() test_data = df.rdd.map( lambda row: LabeledPoint(row[-1], Vectors.dense(row[:]))) print('testData!!!', test_data) # load model start_time = time() #model = GradientBoostedTreesModel.load(spark.sparkContext,"hdfs://student61:9000/wiki/GBDT_model") # model = RandomForestModel.load(spark.sparkContext,"hdfs://student61:9000/wiki/RF_model") model = GradientBoostedTreesModel.load( spark.sparkContext, "hdfs://student61:9000/wiki/GBDT_regression_model") #print(model.toDebugString()) end_time = time() elapsed_time = end_time - start_time print("---------------------------------------------------") print("Time to load model: %.3f seconds" % elapsed_time) print("---------------------------------------------------") # make predictions predictions = model.predict(test_data.map(lambda x: x.features)) end_time = time() elapsed_time = end_time - start_time print("---------------------------------------------------") print("Time from load model to predictions: %.3f seconds" % elapsed_time) print("---------------------------------------------------") print( '--------------------------------------------------------------------')
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Gradient Boosted Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data file
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# split the data into training and test
trainingData, test = data.randomSplit([0.7, 0.3])

# train a gradient boosted tree model
model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index into the (label, prediction) pair: tuple-parameter lambdas such as
# ``lambda (v, p): ...`` are a SyntaxError on Python 3.
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) ** 2).sum() / float(test.count())
print('test mean squared error :' + str(testMSE))
print('learned regression GBT model :')
# toDebugString is a method; without the call the bound-method repr is
# printed instead of the tree ensemble.
print(model.toDebugString())

# save and load
model.save(sc, '../model/myGradientBoostingRegressionModel')
sameModel = GradientBoostedTreesModel.load(sc, '../model/myGradientBoostingRegressionModel')
sc.stop()
print(str(datetime.now())+": Initiating SparkContext...") # Initiate Spark Context try: sc = SparkContext("local","dengue") sqlContext = SQLContext(sc) except: print(str(datetime.now())+": Failed to initiate Spark Context!") print(str(datetime.now())+": Quitting...") sys.exit() print(str(datetime.now())+": Loading trained ML model from HDFS...") # Load trained model from HDFS try: ml_model = GradientBoostedTreesModel.load(sc,"hdfs:///user/w205/dengue_prediction/ml_model") except: print(str(datetime.now())+": Unable to load trained model from HDFS!") print(str(datetime.now())+": Quitting...") sys.exit() print(str(datetime.now())+": Testing database connection...") try: # Connect to the database conn = psycopg2.connect(database="denguepred", user="******", password="******", host="localhost", port="5432") # Create cursor cur = conn.cursor() # Execute a query just to check that we don't get an exception cur.execute("SELECT * from predictions LIMIT 1;") # Try to fetch the result
line.passenger_count, line.trip_time_in_secs, line.trip_distance, line.fare_amount ]) return features # FOR REGRESSION CLASSIFICATION TRAINING AND TESTING indexedTESTreg = encodedFinal.map(parseRowIndexingRegression) # CACHE RDDS IN MEMORY indexedTESTreg.cache() from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel #################################################################### ## REGRESSION: LOAD SAVED MODEL, SCORE AND SAVE RESULTS BACK TO BLOB savedModel = GradientBoostedTreesModel.load(sc, BoostedTreeRegressionFileLoc) predictions = savedModel.predict(indexedTESTreg) # SAVE RESULTS datestamp = unicode(datetime.datetime.now()).replace(' ', '').replace(':', '_') btregressionfilename = "GradientBoostingTreeRegression_" + datestamp + ".txt" dirfilename = scoredResultDir + btregressionfilename predictions.saveAsTextFile(dirfilename) # ## Cleanup objects from memory, print final time, and print scored output file locations # #### Unpersist objects cached in memory taxi_df_test_cleaned.unpersist() indexedTESTreg.unpersist()
def load(name):
    """Build a fresh context via ``init({}, {})`` and attach a saved GBT model.

    The model is read from ``MODEL_DIRECTORY + name`` using the Spark context
    found under the context's ``'spark'`` entry, stored under ``'model'``,
    and the context is returned.
    """
    context = init({}, {})
    spark_context = context['spark'].sparkContext
    model_path = MODEL_DIRECTORY + name
    context['model'] = GradientBoostedTreesModel.load(spark_context, model_path)
    return context
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Read the LIBSVM sample and hold out 30% of it for testing.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Fit the regressor. An empty categoricalFeaturesInfo means every feature
    # is treated as continuous; use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(
        trainingData, categoricalFeaturesInfo={}, numIterations=3)

    # Score the held-out rows and pair each label with its prediction.
    testFeatures = testData.map(lambda point: point.features)
    predictions = model.predict(testFeatures)
    testLabels = testData.map(lambda point: point.label)
    labelsAndPredictions = testLabels.zip(predictions)

    # Mean squared error over the test set.
    squaredErrors = labelsAndPredictions.map(
        lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))
    testMSE = squaredErrors.sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Round-trip the model through storage.
    modelPath = "target/tmp/myGradientBoostingRegressionModel"
    model.save(sc, modelPath)
    sameModel = GradientBoostedTreesModel.load(sc, modelPath)
    # $example off$
def load(self, path):
    """Return the GradientBoostedTreesModel stored at ``path``.

    The model is loaded through the module-level ``sc``; ``self`` is not read.
    """
    loaded_model = GradientBoostedTreesModel.load(sc, path)
    return loaded_model
if __name__ == "__main__":
    sc = SparkContext(
        appName="PythonGradientBoostedTreesClassificationExample")
    # $example on$
    # Read the LIBSVM sample and hold out 30% of it for testing.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Fit the classifier. An empty categoricalFeaturesInfo means every feature
    # is treated as continuous; use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(
        trainingData, categoricalFeaturesInfo={}, numIterations=3)

    # Score the held-out rows and pair each label with its prediction.
    testFeatures = testData.map(lambda point: point.features)
    predictions = model.predict(testFeatures)
    testLabels = testData.map(lambda point: point.label)
    labelsAndPredictions = testLabels.zip(predictions)

    # Fraction of test rows whose prediction differs from the label.
    misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1])
    testErr = misclassified.count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())

    # Round-trip the model through storage.
    modelPath = "target/tmp/myGradientBoostingClassificationModel"
    model.save(sc, modelPath)
    sameModel = GradientBoostedTreesModel.load(sc, modelPath)
    # $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    #  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #         (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={}, numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index into the (label, prediction) pair: tuple-parameter lambdas such as
    # ``lambda (v, p): ...`` are a SyntaxError on Python 3.
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
def test_classification(self):
    """Train each MLlib classifier on a tiny separable set and verify it.

    Every model must predict ``<= 0`` for rows 0 and 2 and ``> 0`` for rows
    1 and 3. The tree-based models are additionally saved to a scratch
    directory and reloaded, and the reloaded model's debug string must match
    the original's. The scratch directory is removed even when an assertion
    fails.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import (
        DecisionTree, DecisionTreeModel, RandomForest,
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel)

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # Rows 0 and 2 carry label 0.0; rows 1 and 3 carry label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_save_load(model, model_class, model_dir):
        # A round trip through save/load must reproduce the same trees.
        model.save(self.sc, model_dir)
        reloaded = model_class.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_save_load(gbt_model, GradientBoostedTreesModel, os.path.join(temp_dir, "gbt"))
    finally:
        # Previously the directory leaked whenever an assertion above failed.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
print(dataPath)
data = MLUtils.loadLibSVMFile(sc, dataPath)

# Split the data set into a training set and a test set.
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train the GBDT classifier.
# An empty categoricalFeaturesInfo means all features are continuous.
# Use a larger numIterations in practice.
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo={},
                                             numIterations=3)

# Predict on the test set.
predictions = model.predict(testData.map(lambda x: x.features))
# Pair the true labels with the predictions.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Fraction of misclassified test samples. The pair is indexed because
# tuple-parameter lambdas such as ``lambda (v, p): ...`` are a SyntaxError
# on Python 3.
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('GradientBoosted Trees Test Error = %5.3f%%' % (testErr * 100))
print("GradientBoosted Trees Learned classifiction tree model : ")
print(model.toDebugString())

# Save and reload the trained model.
modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myGradientBoostingClassificationModel"
model.save(sc, modelPath)
sameModel = GradientBoostedTreesModel.load(sc, modelPath)