def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc, "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc, "bintrade.ml.diff.label_point.check")
    model = GradientBoostedTrees.trainRegressor(lp_train, {}, numIterations=50, maxDepth=10)
    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)
    for each in labels_and_preds.take(100):
        print each
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print each
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print mse
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print mse
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(
        filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i],
                           features=[close_price_train[i]])
        output.append(tmp)
    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)
    print("\n============MODEL Evaluation=============\n")
    model_name = [
        'LinearRegression', 'DecisionTree', 'RandomForest',
        'GradientBoostedTrees'
    ]
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # Swap in a different model RDD here to evaluate against another dataset
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(
            predictions)
        # Root mean squared error on the training set (note the ** 0.5)
        RMSE = (
            labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /
            float(output_train_RDD.count())) ** 0.5
        #print ("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if err > RMSE:
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
def cross_validation_gb(Data_1, Data_2, Data_3, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train_1 = GradientBoostedTrees.trainRegressor(
        Data_1.union(Data_2),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)
    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(Data_3.count())
    model_train_2 = GradientBoostedTrees.trainRegressor(
        Data_2.union(Data_3),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)
    # Evaluate model on test instances and compute test error
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(Data_1.count())
    model_train_3 = GradientBoostedTrees.trainRegressor(
        Data_3.union(Data_1),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)
    # Evaluate model on test instances and compute test error
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(Data_2.count())
    # Average test MSE across the three folds
    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
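# Usage sketch (illustrative, not from the original source): cross_validation_gb expects
# three roughly equal folds of LabeledPoint RDDs. Assuming an RDD named `labeled_points`
# already exists, the folds and the averaged MSE could be obtained like this:
fold_1, fold_2, fold_3 = labeled_points.randomSplit([1.0, 1.0, 1.0], seed=42)
avg_mse = cross_validation_gb(fold_1, fold_2, fold_3,
                              loss_type="leastSquaresError", num_iter=10, maxDepth=4)
print("3-fold average MSE = " + str(avg_mse))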
def testRegression(trainingData, testData, model_path):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3,
                                                maxDepth=4)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) *
                                       (vp[0] - vp[1])).sum() / float(testData.count())
    print("Test Mean Squared Error = " + str(testMSE))
    print("Learned regression GBT model:")
    print(model.toDebugString())
    model.save(sc, model_path)
def testRegression(trainingData, testData):
    # Train a GradientBoostedTrees model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=30,
                                                maxDepth=4)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() \
        / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression ensemble model:')
    print(model.toDebugString())
def main():
    # Reading train and test data
    trainData = sc.pickleFile(input + '/Train_data.average/part-00000')
    testData = sc.pickleFile(input + '/Test_data.average/part-00000')
    parsedData = trainData.map(parseInput).filter(
        lambda line: len(line.features) != 0 or len(line.label) != 0)
    parsedTestData = testData.map(parseInput).filter(
        lambda line: len(line.features) != 0 or len(line.label) != 0).cache()
    model = GradientBoostedTrees.trainRegressor(parsedData,
                                                categoricalFeaturesInfo={},
                                                numIterations=1)
    predictions = model.predict(parsedTestData.map(lambda x: x.features))
    labelsAndPredictions = parsedTestData.map(lambda lp: lp.label).zip(predictions)
    validationErr = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) *
                                             (vp[0] - vp[1])).sum() / float(parsedTestData.count())
    parsedTestData.unpersist()
    RMSE = math.sqrt(validationErr)
    print("Root Mean Squared Error Test= " + str(RMSE))
def validation_gb(trainingData, testData, loss_type, num_iter, maxDepth):
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)
    # Evaluate model on test instances and compute test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(testData.count())
    return testMSE
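# Usage sketch (illustrative, not from the original source): validation_gb can be looped
# over a small hyper-parameter grid. `trainingData` and `testData` are assumed to be
# already-cached RDDs of LabeledPoint.
best = None
for loss_type in ["leastSquaresError", "leastAbsoluteError"]:
    for num_iter in [10, 30]:
        for max_depth in [3, 5]:
            mse = validation_gb(trainingData, testData, loss_type, num_iter, max_depth)
            if best is None or mse < best[0]:
                best = (mse, loss_type, num_iter, max_depth)
print("Best (MSE, loss, numIterations, maxDepth): " + str(best))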
def seg_model_gb(train_data, test_data, loss_type, num_iter, maxDepth):
    removelist_train = set(
        ['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [
        v for i, v in enumerate(train_data.columns) if v not in removelist_train
    ]
    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train,
                                      outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))
    # Creating input dataset in the form of labeled point for training the model
    data_train = (transformed_train.select(
        "features",
        "stars")).map(lambda row: LabeledPoint(row.stars, row.features))
    # Training the model using Gradient Boosted Trees regressor
    model_train = GradientBoostedTrees.trainRegressor(
        sc.parallelize(data_train.collect(), 5),
        categoricalFeaturesInfo={},
        loss=loss_type,
        numIterations=num_iter,
        maxDepth=maxDepth)
    # Creating a list of features to be used for predictions
    removelist_final = set(
        ['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [
        v for i, v in enumerate(test_data.columns) if v not in removelist_final
    ]
    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final,
                                      outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))
    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")
    # Predicting ratings using the developed model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(
        lambda data_final: data_final.review_id).zip(predictions)
    return labelsAndPredictions
def main():
    records = get_records()
    first = records.first()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    first_point = data.first()
    gbt_model = GradientBoostedTrees.trainRegressor(data,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=3)
    # Note: calling predict() inside an RDD transformation is not supported for
    # JVM-backed MLlib models; the zip-based approach below is used instead.
    true_vs_predicted_gbt = data.map(lambda p:
                                     (p.label, gbt_model.predict(p.features)))
    predictions = gbt_model.predict(data.map(lambda x: x.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    print "GradientBoosted Trees predictions: " + str(
        labelsAndPredictions.take(5))
    mse = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(data.count())
    mae = labelsAndPredictions.map(lambda vp: np.abs(vp[0] - vp[1])).sum() /\
        float(data.count())
    msle = labelsAndPredictions.map(lambda vp: ((np.log(vp[1] + 1) - np.log(vp[0] + 1)) ** 2)).sum() /\
        float(data.count())
    print('Gradient Boosted Trees - Mean Squared Error = ' + str(mse))
    print('Gradient Boosted Trees - Mean Absolute Error = ' + str(mae))
    print('Gradient Boosted Trees - Mean Squared Log Error = ' + str(msle))
def labelData(data):
    return data.map(lambda row: LabeledPoint(row[2], row[3:]))


f = open('GradientBoostedTree_regression_evaluation.txt', 'w')
training, test = labelData(data).randomSplit([0.8, 0.2])
numTraining = training.count()
numTest = test.count()


def getPredictionsLabels(model, test):
    predictions = model.predict(test.map(lambda r: r.features))
    return predictions.zip(test.map(lambda r: r.label))


def printMetrics(predictions_and_labels):
    metrics = RegressionMetrics(predictions_and_labels)
    f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
    f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
    f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
    f.write('Root Mean Squared Error:{0}\n'.format(
        metrics.rootMeanSquaredError))
    f.write('R^2 :{0}\n'.format(metrics.r2))


model = GradientBoostedTrees.trainRegressor(training, categoricalFeaturesInfo={})
f.write(model.toDebugString())
predictions_and_labels = getPredictionsLabels(model, test)
printMetrics(predictions_and_labels)
f.close()
sc.stop()
def runmodel_spark(spark, train, test, modelname):
    # NOTE: this snippet is incomplete in the original source; `make_dataframe` and
    # `chromosome` are defined elsewhere, and the trained model is not used further here.
    newtrain = make_dataframe(chromosome, train)
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Train on the freshly loaded data (the original referenced an undefined `trainingData`)
    model = GradientBoostedTrees.trainRegressor(data,
                                                categoricalFeaturesInfo={},
                                                numIterations=30)
print(x)
print(y)
os.environ["SPARK_HOME"] = "/Users/alexsisu/programs/spark-1.6.0"
conf = SparkConf().setAppName("myapp").setMaster("local")
sc = SparkContext(conf=conf)
input_data = []
for (xx, yy) in zip(x, y):
    lp = LabeledPoint(xx, [yy])
    input_data.append(lp)
training_data = sc.parallelize(input_data).cache()
test_data_rdd = sc.parallelize(input_data).cache()
classificationModel = GradientBoostedTrees.trainRegressor(
    training_data, categoricalFeaturesInfo={}, numIterations=100, maxDepth=10)
result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))
print classificationModel
print classificationModel.toDebugString()
print "==============================="
predicted_data = result.collect()
print(predicted_data)
zippedResult = test_data_rdd.map(lambda x: x.label).zip(result)
metrics = RegressionMetrics(zippedResult)
print(metrics.meanAbsoluteError)
print(metrics.meanSquaredError)
print(metrics.rootMeanSquaredError)
training, test = labelData(data).randomSplit([0.8, 0.2])
numTraining = training.count()
numTest = test.count()


def getPredictionsLabels(model, test):
    predictions = model.predict(test.map(lambda r: r.features))
    return predictions.zip(test.map(lambda r: r.label))


def printMetrics(predictions_and_labels):
    metrics = RegressionMetrics(predictions_and_labels)
    f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
    f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
    f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
    f.write('Root Mean Squared Error:{0}\n'.format(
        metrics.rootMeanSquaredError))
    f.write('R^2 :{0}\n'.format(metrics.r2))


timestart = datetime.datetime.now()
model = GradientBoostedTrees.trainRegressor(training, categoricalFeaturesInfo={},
                                            loss='leastSquaresError', numIterations=10,
                                            learningRate=0.1, maxDepth=15, maxBins=16)
f.write(model.toDebugString())
predictions_and_labels = getPredictionsLabels(model, test)
printMetrics(predictions_and_labels)
timeend = datetime.datetime.now()
timedelta = round((timeend - timestart).total_seconds(), 2)
f.write("Time taken to execute this model is: " + str(timedelta) + " seconds.\n")
f.close()
sc.stop()
if model_type == "classification": model = GradientBoostedTrees.trainClassifier( lp, categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors), loss=loss_param, numIterations=numIterations_param, learningRate=learningRate_param, maxDepth=maxDepth_param, maxBins=maxBins_param) else: # regression model = GradientBoostedTrees.trainRegressor( lp, categoricalFeaturesInfo=dmt.getCategoricalFeatureInfo(df,predictors), loss=loss_param, numIterations=numIterations_param, learningRate=learningRate_param, maxDepth=maxDepth_param, maxBins=maxBins_param) build_report = mbr.report(lp.count(),lp.getNumPartitions(), predictors,datamodel,target,model_type, settings=[("Algorithm","Gradient Boosted Trees",[("loss",loss_param),("numIterations",numIterations_param),("learningRate",learningRate_param),("maxDepth",maxDepth_param),("maxBins",maxBins_param)])]) print(build_report) model.save(sc, modelpath) model_metadata = { "target":target, "predictors":predictors, "datamodel": datamodel, "model_type":model_type } print(model.toDebugString())
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir",
                               "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle, math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)
input = sys.argv[1]


# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])


train = sc.pickleFile(input + '/bow_train/part-00000')
test = sc.pickleFile(input + '/bow_test/part-00000')
parsedtrain = train.map(parsePoint).filter(
    lambda line: len(line.features) != 0 or len(line.label) != 0)
parsedtest = test.map(parsePoint).filter(
    lambda line: len(line.features) != 0 or len(line.label) != 0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain,
                                            categoricalFeaturesInfo={},
                                            numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) *
                                   (vp[0] - vp[1])).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE = math.sqrt(val_err)
print("Root Mean Squared Error Test= " + str(RMSE))
all_data = np.array(zip(yy, xx))
sss = ShuffleSplit(len(all_data) - 1, test_size=0.20, random_state=1234)
for train_indexes, test_indexes in sss:
    lparr = []
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]
    for medianvalue, record in sample_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        lparr.append(lp)
    for medianvalue, record in test_data:
        lp = LabeledPoint(medianvalue, tuple(record))
        test_lp_arr.append(lp)
    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()
    regression_model = GradientBoostedTrees.trainRegressor(
        training_data, categoricalFeaturesInfo={}, numIterations=10, maxDepth=10)
    result = regression_model.predict(test_data_rdd.map(lambda x: x.features))
    print regression_model
    print regression_model.toDebugString()
    print "==============================="
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()
    print mean_absolute_error(actual_data, predicted_data)
    break
from pyspark import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # $example on$
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={},
                                                numIterations=3)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(sc,
                                               "target/tmp/myGradientBoostingRegressionModel")
    # $example off$
def test_regression(self):
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4
    )
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1
    )
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4
    )
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()

    # Verify that maxBins is being passed through
    GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
    )
    with self.assertRaises(Exception):
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
        )
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.linalg import SparseVector
from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

sparse_data = [
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
    LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
    LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
]
data = sc.parallelize(sparse_data)

model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
model.numTrees()
model.totalNumNodes()
model.predict(SparseVector(2, {1: 1.0}))
model.predict(SparseVector(2, {0: 1.0}))
rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
print(model.predict(rdd).collect())
model.save(sc, 'model')
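# Follow-up sketch (illustrative, assuming the save above succeeded and the 'model' path
# is readable): the persisted ensemble can be reloaded with GradientBoostedTreesModel and
# reused directly for prediction.
from pyspark.mllib.tree import GradientBoostedTreesModel
same_model = GradientBoostedTreesModel.load(sc, 'model')
print(same_model.predict(SparseVector(2, {1: 1.0})))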
# We have to do something here to cache the dataset, otherwise it hangs later on due to a PySpark bug
num_records = training_data.count()
print(" * Transformed data read!")

print(" * Training test ML model... ")
# Label the data points
labeled_data = training_data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
# Separate training and testing data
train_data, test_data = labeled_data.randomSplit([0.8, 0.2])
# Do something again to avoid the PySpark bug hang from manifesting
num_train_recs = train_data.count()
num_test_recs = test_data.count()
# Train the model
ml_model = GradientBoostedTrees.trainRegressor(train_data, {},
                                               numIterations=20,
                                               loss='leastAbsoluteError')
print(" * Model trained!")

print(" * Testing model error... ")
# Predict and calculate error metrics
predictions = ml_model.predict(test_data.map(lambda r: r.features))
predictions = predictions.zip(test_data.map(lambda r: r.label))
metrics = RegressionMetrics(predictions)
print(" * Model regression error metrics: ")
print(" - Mean Absolute Error: %.2f" % metrics.meanAbsoluteError)
print(" - Mean Squared Error: %.2f" % metrics.meanSquaredError)
print(" - Root Mean Squared Error: %.2f" % metrics.rootMeanSquaredError)
dirfilename = modelDir + rfclassificationfilename
rfModel.save(sc, dirfilename)

# Convert to df
test_predictions = sqlContext.createDataFrame(predictionAndLabels)
test_predictions.registerTempTable("randomForest_results")
'''

## GRAD BOOSTED TREES ##
categoricalFeaturesInfo = {0: 2, 1: 2, 2: 6, 3: 4}
gbtModel = GradientBoostedTrees.trainRegressor(
    indexed_train_reg,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
    numIterations=10,
    maxBins=32,
    maxDepth=4,
    learningRate=0.1)
predictions = gbtModel.predict(indexed_test_reg.map(lambda x: x.features))
predictionAndLabels = indexed_test_reg.map(lambda lp: lp.label).zip(
    predictions)
testMetrics = RegressionMetrics(predictionAndLabels)
print("RMSE = %s" % testMetrics.rootMeanSquaredError)
print("R-sqr = %s" % testMetrics.r2)

# Save model
datestamp = unicode(datetime.datetime.now()).replace(' ', '').replace(':', '_')
def trainTestSaveALLModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    predictors = []
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
        predictors = hitterPredictors
    else:
        modelType = 'pitching'
        predictors = pitcherPredictors
    not_features.extend(predictors)
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print features.take(3)
    print "# features=", features.count()
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None
    for predictor in predictors:
        #global predictField
        #predictField = predictor
        #data = features.map(toLabeledPoint).coalesce(50)
        #data = toLabeledPoint(features, predictor).coalesce(50)
        #print "len data=", data.count()
        print "catFeatures=", catFeatures
        # Split the data into training and test sets (30% held out for testing)
        (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
        #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
        trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
        #testData = f_testData.map(toLabeledPoint).coalesce(50)
        testData = toLabeledPoint(f_testData, predictor).coalesce(50)
        testData.cache()
        print "testData count=", testData.count()
        playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
        print "playerIds=", playerIds
        print "playerIds=", playerIds.take(2)
        print "len playerIds=", playerIds.count()
        # Train a GradientBoostedTrees model.
        # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
        #        (b) Use more iterations in practice.
        model = GradientBoostedTrees.trainRegressor(trainingData,
                                                    categoricalFeaturesInfo=catFeatures,
                                                    maxDepth=5,
                                                    numIterations=1,
                                                    maxBins=300)
        # Evaluate model on test instances and compute test error
        predictions = model.predict(testData.map(lambda x: x.features)).cache()
        print "# predictions=", predictions.count()
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        if fd_points_testData is None:
            fd_points_testData = f_testData.map(
                lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points)).toDF(
                    ['player_id', 'actual_fd_points']).coalesce(50)
        testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
        testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
        print predictor + ' Test Mean Squared Error = ' + str(testMSE)
        print predictor + ' Test Mean Absolute Error = ' + str(testMAE)
        if all_fd_points_df is None:
            #all_fd_points_df = testData.map(lambda x: x.player_id).zip(predictions).toDF(['player_id', predictor]).cache()
            print "FIRST: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            all_fd_points_df = playerIds.zip(predictions).toDF(
                ['player_id', predictor]).alias('all_fd_points_df').cache()
            print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            print "# all_fd_points_df", all_fd_points_df.count()
            print "first all_fd_points_df", all_fd_points_df.take(5)
            print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
        else:
            print "ELSE: # predictions=", predictions.count()
            print " # playerIds=", playerIds.count()
            curr_fd_points_df = playerIds.zip(predictions).toDF(
                ['player_id', predictor]).alias('curr_fd_points_df')
            print "all_fd_points_df", all_fd_points_df.printSchema()
            print "PRE all_fd_points_df", all_fd_points_df.take(5)
            print "curr_fd_points_df", curr_fd_points_df.printSchema()
            print "few curr_fd_points_df", curr_fd_points_df.take(5)
            print "# curr_fd_points_df", curr_fd_points_df.count()
            print "distinct curr_fd_points_df", curr_fd_points_df.select('player_id').distinct().count()
            print "first curr", curr_fd_points_df.take(5)
            #all_fd_points_df = all_fd_points_df.join(curr_fd_points_df, all_fd_points_df.player_id == curr_fd_points_df.player_id, 'inner').drop(curr_fd_points_df.player_id)
            all_fd_points_df = all_fd_points_df.join(
                curr_fd_points_df,
                col("all_fd_points_df.player_id") == col("curr_fd_points_df.player_id")).drop(
                    curr_fd_points_df.player_id).alias('all_fd_points_df').cache()
            print "second ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
            #print "all debugstring", all_fd_points_df.rdd.toDebugString()
            #print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
            print "first few all_fd_points_df=", all_fd_points_df.take(3)
            print "count few all_fd_points_df=", all_fd_points_df.count()
        print "converted:"
        print populateDebugString(model, featureLookup)
        # Save and load model
        modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
        if modelType == "batting":
            modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
        try:
            shutil.rmtree(modelFilename)
        except OSError:
            pass
        model.save(sc, modelFilename)
        #sameModel = GradientBoostedTreesModel.load(sc, "myModelPath")
    print "DONE. all_fd_points_df", all_fd_points_df.printSchema()
    print "# of all_fd_points=", all_fd_points_df.count()
    print "first of all_fd_points=", all_fd_points_df.take(5)
    try:
        shutil.rmtree(rddDir + 'all_fd_points_df.csv')
    except OSError:
        pass
    all_fd_points_df.write.format('com.databricks.spark.csv').save(rddDir + 'all_fd_points_df.csv')
    allPredictions = None
    if len(predictors) > 1:
        allPredictions = all_fd_points_df.map(sumFD).toDF()
    else:
        allPredictions = all_fd_points_df.map(renameSumFD).toDF()
    print allPredictions.rdd.toDebugString()
    print "predf allPredictions=", allPredictions.take(5)
    #allPredictions = allPredictions.toDF()
    try:
        shutil.rmtree(rddDir + 'allPredictions.csv')
    except OSError:
        pass
    allPredictions.write.format('com.databricks.spark.csv').save(rddDir + 'allPredictions.csv')
    print "allPredictions=", allPredictions.take(5)
    print "# of allPredictions=", allPredictions.count()
    predict_and_actuals = allPredictions.join(
        fd_points_testData,
        allPredictions.player_id == fd_points_testData.player_id).drop(fd_points_testData.player_id)
    print "predict_and_actuals=", predict_and_actuals.take(3)
    #labelsAndPredictions = all_fd_points_df.map(lambda x: x.fd_points).zip(allPredictions).cache()
    labelsAndPredictions = predict_and_actuals
    print "labelsAndPredictions=", labelsAndPredictions.take(3)

    def mse(x):
        r = x.asDict()
        if r['actual_fd_points'] is None:
            r['actual_fd_points'] = 0.0
        return (r['actual_fd_points'] - r['fd_sum']) * (r['actual_fd_points'] - r['fd_sum'])

    def mae(x):
        r = x.asDict()
        if r['actual_fd_points'] is None:
            r['actual_fd_points'] = 0.0
        return abs(r['actual_fd_points'] - r['fd_sum'])

    testMSE = labelsAndPredictions.map(mse).sum() / float(allPredictions.count())
    testMAE = labelsAndPredictions.map(mae).sum() / float(allPredictions.count())
    print 'Merged ' + modelType + ' Test Mean Squared Error = ' + str(testMSE)
    print 'Merged ' + modelType + ' Test Mean Absolute Error = ' + str(testMAE)
from pyspark import SparkContext
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    sc = SparkContext(appName="PythonGradientBoostedTreesRegressionExample")
    # Load and parse the data file (the second load replaces the sample dataset).
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/newborn2013.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo={
                                                    0: 3,
                                                    1: 4,
                                                    2: 2
                                                },
                                                numIterations=3)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression GBT model:')
    print(model.toDebugString())
    # Save and load model
    model.save(sc, "target/tmp/myGradientBoostingRegressionModel")
    sameModel = GradientBoostedTreesModel.load(
        sc, "target/tmp/myGradientBoostingRegressionModel")
def extract_label(record):
    return float(record[-1])


data_dt = records.map(lambda r: LabeledPoint(extract_label(r),
                                             extract_features_dt(r)))
data_with_idx_dt = data_dt.zipWithIndex().map(lambda p: (p[1], p[0]))
test_dt = data_with_idx_dt.sample(False, 0.3, 42)
train_dt = data_with_idx_dt.subtractByKey(test_dt)
train_data_dt = train_dt.map(lambda p: p[1])
test_data_dt = test_dt.map(lambda p: p[1])

# Train the Gradient Boosted Trees model, passing explicit (conservative)
# arguments to the trainRegressor method
gbt_model = GradientBoostedTrees.trainRegressor(train_data_dt,
                                                categoricalFeaturesInfo={},
                                                numIterations=10,
                                                learningRate=0.01,
                                                maxDepth=1,
                                                maxBins=2)
predictions_GBT = gbt_model.predict(test_data_dt.map(lambda x: x.features))
true_vs_predicted_dt = test_data_dt.map(lambda lp: lp.label).zip(predictions_GBT)
print("Gradient Boosted Tree prediction:" + str(true_vs_predicted_dt.take(5)))


# Error calculating functions
# Squared error
def squared_error(actual, pred):
    return (pred - actual) ** 2


# Absolute error
def abs_error(actual, pred):
    return np.abs(pred - actual)
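# Usage sketch (illustrative, not from the original source): the helpers above can be
# applied to the (label, prediction) pairs already built in true_vs_predicted_dt.
mse_gbt = true_vs_predicted_dt.map(lambda tp: squared_error(tp[0], tp[1])).mean()
mae_gbt = true_vs_predicted_dt.map(lambda tp: abs_error(tp[0], tp[1])).mean()
print("Gradient Boosted Trees - Mean Squared Error = " + str(mse_gbt))
print("Gradient Boosted Trees - Mean Absolute Error = " + str(mae_gbt))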
def evaluate_gbt(train, test, numIterValue, maxDepth, maxBins):
    gbt_model = GradientBoostedTrees.trainRegressor(train,
                                                    categoricalFeaturesInfo={},
                                                    numIterations=numIterValue,
                                                    maxDepth=maxDepth,
                                                    maxBins=maxBins)
    predictions_GBT = gbt_model.predict(test.map(lambda x: x.features))
    labelsAndPredictions_GBT = test.map(lambda lp: lp.label).zip(predictions_GBT)
    rmsleGBT = np.sqrt(labelsAndPredictions_GBT.map(
        lambda lp: squared_log_error(lp[0], lp[1])).mean())
    return rmsleGBT
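# evaluate_gbt assumes a squared_log_error helper that is not defined in this snippet; a
# minimal sketch consistent with the RMSLE it computes (the squared difference of
# log-transformed values, so argument order does not matter) could be:
import numpy as np

def squared_log_error(actual, pred):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2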
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')
model = GradientBoostedTrees.trainRegressor(traindata,
                                            categoricalFeaturesInfo={},
                                            numIterations=5)
predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(
    data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_rbt")
predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda vp: (vp[0] - vp[1]) *
                                       (vp[0] - vp[1])).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_rbt")
predictions_540 = model.predict(data_540.map(lambda x: x.features))
Train, Test = df.randomSplit([0.8, 0.2])
train_data = []
test_data = []
for row in Train.rdd.collect():
    train_data.append(LabeledPoint(row[-1], list(row[:-1])))
y_test = []
X_test = []
for row in Test.rdd.collect():
    y_test.append(row[-1])
    X_test.append(list(row[:-1]))
dir()
# Free intermediate objects that are no longer needed
del data_f
del df_filter
del history_good_per
grm = GradientBoostedTrees.trainRegressor(sc.parallelize(train_data), {},
                                          numIterations=1)
grm.save(sc, "file:///data/grm_model.model")
pred = list(map(lambda x: grm.predict(x), X_test))

from pyspark.mllib.evaluation import RegressionMetrics
predictionAndObservations = sc.parallelize(zip(pred, y_test))
metrics = RegressionMetrics(predictionAndObservations)
metrics.meanAbsoluteError
metrics.meanSquaredError
trainData, testData = train_test_split(option, test_size=0.2, random_state=42)
train = trainData.as_matrix()
test = testData.as_matrix()


def parsePoint(line):
    return LabeledPoint(line[7], line[0:7])


# create RDD
trainRDD = sc.parallelize(train)
testRDD = sc.parallelize(test)
trainLP = trainRDD.map(parsePoint)
testLP = testRDD.map(parsePoint)

# In[122]:

# build GB model
GBmodel = GradientBoostedTrees.trainRegressor(trainLP,
                                              categoricalFeaturesInfo={5: 2},
                                              numIterations=3)
predictions = GBmodel.predict(testLP.map(lambda x: x.features))
sparkGBError = testLP.map(lambda lp: lp.label).zip(predictions)
# compute MSE
testMSE = sparkGBError.map(lambda v: (v[0] - v[1]) ** 2).sum() / float(testLP.count())

# In[124]:

testMSE

# In[111]:

# build SVM model
from pyspark.mllib.classification import SVMWithSGD, SVMModel
testFinal.collect()

# For getting the threshold limit, using the train dataset
(training1, training2) = trainFinal.randomSplit([0.7, 0.3])
training1.collect()

model_1 = RandomForest.trainRegressor(training1,
                                      categoricalFeaturesInfo={},
                                      numTrees=3,
                                      featureSubsetStrategy="auto",
                                      impurity='variance',
                                      maxDepth=4,
                                      maxBins=32)
model_2 = GradientBoostedTrees.trainRegressor(training1,
                                              categoricalFeaturesInfo={},
                                              numIterations=3)
model_3 = DecisionTree.trainRegressor(training1,
                                      categoricalFeaturesInfo={},
                                      impurity='variance',
                                      maxDepth=5,
                                      maxBins=32)

predictionsRFTrain = model_1.predict(training1.map(lambda x: x.features))
predictionsGBTTrain = model_2.predict(training1.map(lambda x: x.features))
predictionsDTTrain = model_3.predict(training1.map(lambda x: x.features))
predictionsRFTrain.collect()
predictionsGBTTrain.collect()
predictionsDTTrain.collect()
training1.collect()
def trainTestSaveFDPointsModel(rddDir, encodedFeaturesParq, featuresNumValsFile):
    modelType = ""
    if "batting" in encodedFeaturesParq:
        modelType = 'batting'
    else:
        modelType = 'pitching'
    predictor = 'fd_points'
    not_features.append(predictor)
    # Load and parse the data file.
    features = sqlContext.read.parquet(encodedFeaturesParq).cache()
    print features.take(3)
    print "# features=", features.count()
    numVals = sqlContext.read.json(featuresNumValsFile).take(1)[0].asDict()
    (catFeatures, featureLookup) = getCatFeatures(features, numVals)
    all_fd_points_df = None
    fd_points_testData = None
    predictions = None
    print "catFeatures=", catFeatures
    # Split the data into training and test sets (30% held out for testing)
    (f_trainingData, f_testData) = features.randomSplit([0.7, 0.3], seed=1)
    #trainingData = f_trainingData.map(toLabeledPoint).coalesce(50)
    trainingData = toLabeledPoint(f_trainingData, predictor).coalesce(50)
    #testData = f_testData.map(toLabeledPoint).coalesce(50)
    testData = toLabeledPoint(f_testData, predictor).coalesce(50)
    testData.cache()
    print "testData count=", testData.count()
    playerIds = f_testData.map(lambda x: str(x.player_id) + '_' + x.game_id).coalesce(50)
    print "playerIds=", playerIds
    print "playerIds=", playerIds.take(2)
    print "len playerIds=", playerIds.count()
    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainRegressor(trainingData,
                                                categoricalFeaturesInfo=catFeatures,
                                                maxDepth=6,
                                                numIterations=32,
                                                maxBins=300)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features)).cache()
    print "# predictions=", predictions.count()
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    fd_points_testData = f_testData.map(
        lambda x: (str(x.player_id) + '_' + x.game_id, x.fd_points or 0.0)).toDF(
            ['player_id', 'actual_fd_points']).coalesce(50)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    testMAE = labelsAndPredictions.map(lambda vp: abs(vp[0] - vp[1])).sum() / float(testData.count())
    print predictor + ' Test Mean Squared Error = ' + str(testMSE)
    print predictor + ' Test Mean Absolute Error = ' + str(testMAE)
    # print " # playerIds=", playerIds.count()
    # all_fd_points_df = playerIds.zip(predictions).toDF(['player_id', predictor]).alias('all_fd_points_df').cache()
    # print "FIRST ALL_FD_POINTS_DF", all_fd_points_df.printSchema()
    # print "# all_fd_points_df", all_fd_points_df.count()
    # print "first all_fd_points_df", all_fd_points_df.take(5)
    # print "distinct all_fd_points_df", all_fd_points_df.select('player_id').distinct().count()
    print "converted:"
    print populateDebugString(model, featureLookup)
    # Save and load model
    modelFilename = rddDir + "pitching_" + predictor + "_model.RandomForest"
    if modelType == "batting":
        modelFilename = rddDir + "batting_" + predictor + "_model.RandomForest"
    try:
        shutil.rmtree(modelFilename)
    except OSError:
        pass
    model.save(sc, modelFilename)
    fd_points_testData_filename = rddDir + modelType + '_' + 'fd_points_testData.csv'
    try:
        shutil.rmtree(fd_points_testData_filename)
    except OSError:
        pass
    fd_points_testData.write.format('com.databricks.spark.csv').option(
        'header', 'true').save(fd_points_testData_filename)
(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3])

# COMMAND ----------

labeledPoints.collect()

# COMMAND ----------

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Train a GradientBoostedTrees model.
# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#        (b) Use more iterations in practice.
model = GradientBoostedTrees.trainRegressor(trainingData,
                                            categoricalFeaturesInfo={},
                                            numIterations=3)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) *
                                   (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression GBT model:')
print(model.toDebugString())

# COMMAND ----------

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils