import math

from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils


def main(sc, sql_context, is_hive=True):
    lp_train = MLUtils.loadLabeledPoints(sc, "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc, "bintrade.ml.diff.label_point.check")

    model = GradientBoostedTrees.trainRegressor(
        lp_train, {}, numIterations=50, maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))

    # Inspect the 100 highest-scoring predictions.
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)
    for each in labels_and_preds.take(100):
        print(each)

    # Inspect the 100 lowest-scoring predictions.
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print(each)

    # Mean squared error of the model's predictions.
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print(mse)

    # Baseline: mean squared error of always predicting 1.0.
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print(mse)
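# A minimal driver sketch for invoking main() above; the app name
# "GBTDiffRegressor" and the use of SQLContext are assumptions, not
# part of the original snippet.
if __name__ == "__main__":
    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    sc = SparkContext(appName="GBTDiffRegressor")  # assumed app name
    main(sc, SQLContext(sc))
    sc.stop()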
import json

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.util import MLUtils


# Method of a BaseHTTPRequestHandler subclass; relies on a SparkContext
# bound to the global name `sc`.
def do_POST(self):
    response_code = 200
    response = ""
    var_len = int(self.headers.get('Content-Length'))
    content = self.rfile.read(var_len)
    payload = json.loads(content)

    # For a training request, append the sample to the training data file;
    # the model is retrained from that file on the next predict request.
    if payload.get('train'):
        # Convert the sample into LabeledPoint text format:
        # "<label>,<f0> <f1> ...".
        train_data = ""
        for d in payload['trainArray'][0]['y0']:
            train_data = train_data + " " + ('%d' % d)
        train_data = ('%d' % payload['trainArray'][0]['label']
                      + "," + train_data.lstrip() + "\n")
        print(train_data)
        with open('LabeledPointsdata.txt', 'a') as add_output:
            add_output.write(train_data)
    # For a prediction request, train on the accumulated data and
    # return the predicted label.
    elif payload.get('predict'):
        try:
            training = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata.txt")
            print('Begin NaiveBayes training!')
            model = NaiveBayes.train(training, 1.0)
            print('Training over!')
            print(payload['image'])
            response = {
                "type": "test",
                "result": str(model.predict(payload['image']))
            }
        except Exception:
            response_code = 500
    else:
        response_code = 400

    self.send_response(response_code)
    self.send_header("Content-type", "application/json")
    self.send_header("Access-Control-Allow-Origin", "*")
    self.end_headers()
    if response:
        self.wfile.write(json.dumps(response).encode('utf-8'))
    return
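# A minimal sketch of how this handler might be served; the class name
# TrainPredictHandler, the app name, and port 8000 are assumptions, not
# part of the original snippet.
from http.server import BaseHTTPRequestHandler, HTTPServer
from pyspark import SparkContext


class TrainPredictHandler(BaseHTTPRequestHandler):
    do_POST = do_POST  # bind the module-level handler defined above


if __name__ == "__main__":
    sc = SparkContext(appName="TrainPredictServer")  # assumed app name
    HTTPServer(('', 8000), TrainPredictHandler).serve_forever()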
import pyspark
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def main(input_file):
    sc = pyspark.SparkContext(appName="DecisionTree")
    data = MLUtils.loadLabeledPoints(sc, input_file)
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Cache in memory for faster training
    trainingData.cache()

    model = DecisionTree.trainClassifier(
        trainingData, numClasses=4, impurity='gini',
        categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())

    # print(model.toDebugString())
    print("")
    print("")
    print("Test Error: {}".format(round(testErr, 4)))
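# A minimal command-line entry point for the function above; reading the
# input path from sys.argv is an assumption, not in the original snippet.
if __name__ == "__main__":
    import sys
    main(sys.argv[1])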
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/AmountVectors2/')

# Split the data into training and test sets.
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()
testData.cache()

model = RandomForest.trainRegressor(
    trainingData, categoricalFeaturesInfo={}, numTrees=20,
    featureSubsetStrategy="auto", impurity='variance',
    maxDepth=3, maxBins=32)

# Evaluate the model on test instances and compute the test MSE.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())
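# The RandomForestModel import above is otherwise unused; a minimal sketch
# of persisting and reloading the trained model with it. The output path is
# an assumption, not from the original snippet.
model.save(sc, 'gs://cs123data/Output/AmountModel/')
sameModel = RandomForestModel.load(sc, 'gs://cs123data/Output/AmountModel/')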
import pyspark
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/PartyVectors/')

# Split the data into training and test sets.
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()

# The depth of the tree proved to be a significant bottleneck.
model = RandomForest.trainClassifier(
    trainingData, numClasses=4, categoricalFeaturesInfo={},
    numTrees=700, featureSubsetStrategy="auto", impurity='gini',
    maxDepth=8, maxBins=12)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())

print("")
print("")
print('Test Error: ' + str(testErr))
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    sc = SparkContext(appName="BloodTestReportPythonSVMExample")

    # Load the data.
    print('Begin Load Data File!')
    sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
    print('Data File has been Loaded!')

    accuracySex = []
    for i in range(100):
        # Randomly split the data 9:1 into training data (training)
        # and test data (test).
        sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])

        # Train a linear support vector machine model.
        # print('Begin SVM training!')
        sexModel = SVMWithSGD.train(sexTraining, iterations=100)
        # print('Training over!')

        # Predict on the test data and record the prediction accuracy.
        sexPredictionAndLabel = sexTest.map(
            lambda p: (sexModel.predict(p.features), p.label))
        accuracySex.append(
            1.0 * sexPredictionAndLabel.filter(
                lambda xv: xv[0] == xv[1]).count() / sexTest.count())
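    # The loop above collects per-run accuracies but never reports them;
    # a minimal follow-up, assuming the runs are summarized by their mean.
    print('Mean accuracy over %d runs: %f'
          % (len(accuracySex), sum(accuracySex) / len(accuracySex)))
    sc.stop()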