Example #1
import math

from pyspark.mllib.tree import GradientBoostedTrees
from pyspark.mllib.util import MLUtils


def main(sc, sql_context, is_hive=True):
    # Load the training and evaluation sets as RDDs of LabeledPoint.
    lp_train = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.train")
    lp_check = MLUtils.loadLabeledPoints(sc,
                                         "bintrade.ml.diff.label_point.check")

    # Train a gradient-boosted trees regressor; the empty dict means
    # there are no categorical features.
    model = GradientBoostedTrees.trainRegressor(lp_train, {},
                                                numIterations=50,
                                                maxDepth=10)

    preds = model.predict(lp_check.map(lambda x: x.features))
    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=False)

    # Inspect the 100 highest-scored and 100 lowest-scored predictions.
    for each in labels_and_preds.take(100):
        print(each)

    labels_and_preds = lp_check.map(lambda x: x.label).zip(preds).sortBy(
        lambda x: x[1], ascending=True)
    for each in labels_and_preds.take(100):
        print(each)

    # Mean squared error of the model, then of a constant-1.0 baseline.
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - x[1], 2)).sum() / labels_and_preds.count()
    print(mse)
    mse = labels_and_preds.map(
        lambda x: math.pow(x[0] - 1.0, 2)).sum() / labels_and_preds.count()
    print(mse)
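
The MSE above is computed by hand; pyspark.mllib also ships a RegressionMetrics helper that does the same bookkeeping. A minimal sketch, assuming the labels_and_preds RDD of (label, prediction) pairs from the example above:

from pyspark.mllib.evaluation import RegressionMetrics

# RegressionMetrics expects (prediction, observation) pairs, so swap the
# (label, prediction) tuples produced above.
metrics = RegressionMetrics(labels_and_preds.map(lambda x: (x[1], x[0])))
print(metrics.meanSquaredError)      # same quantity as the manual MSE above
print(metrics.rootMeanSquaredError)
print(metrics.r2)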
Example #2
    # Handler method from an HTTP request-handler class; it assumes json,
    # MLUtils, NaiveBayes, and a SparkContext named sc exist at module level.
    def do_POST(self):
        response_code = 200
        response = ""
        var_len = int(self.headers.get('Content-Length'))
        content = self.rfile.read(var_len)
        payload = json.loads(content)

        # Training request: append the sample to the training file so the
        # model can be retrained from the accumulated data later.
        if payload.get('train'):
            # Convert the sample to "label,feature feature ..." format.
            train_data = ""
            for d in payload['trainArray'][0]['y0']:
                train_data = train_data + " " + ('%d' % d)
            train_data = ('%d' % payload['trainArray'][0]['label']
                          + "," + train_data.lstrip() + "\n")
            print(train_data)
            with open('LabeledPointsdata.txt', 'a') as output_file:
                output_file.write(train_data)

        # Prediction request: train on the accumulated data and return
        # the predicted label.
        elif payload.get('predict'):
            try:
                training = MLUtils.loadLabeledPoints(sc,
                                                     "LabeledPointsdata.txt")
                print('Begin NaiveBayes training!')
                model = NaiveBayes.train(training, 1.0)
                print('Training over!')
                print(payload['image'])
                response = {
                    "type": "test",
                    "result": str(model.predict(payload['image']))
                }
            except Exception:
                response_code = 500
        else:
            response_code = 400

        self.send_response(response_code)
        self.send_header("Content-type", "application/json")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        if response:
            self.wfile.write(json.dumps(response).encode('utf-8'))
        return
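
The handler distinguishes requests purely by the 'train' and 'predict' keys in the JSON body. A hypothetical client exercising both branches; the host, port, and payload values are assumptions, not part of the original example:

import requests

# Append one training sample (label plus the 'y0' feature vector).
requests.post('http://localhost:8000',
              json={'train': True,
                    'trainArray': [{'label': 3, 'y0': [0, 1, 0, 1]}]})

# Request a prediction for a feature vector of the same length.
r = requests.post('http://localhost:8000',
                  json={'predict': True, 'image': [0, 1, 0, 1]})
print(r.status_code, r.text)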
Example #3
import pyspark
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def main(input_file):

    sc = pyspark.SparkContext(appName="DecisionTree")

    data = MLUtils.loadLabeledPoints(sc, input_file)

    trainingData, testData = data.randomSplit([0.7, 0.3])
    # Cache in memory for faster training
    trainingData.cache()

    model = DecisionTree.trainClassifier(trainingData, numClasses=4, impurity='gini',
                                         categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    # print(model.toDebugString())
    print("")
    print("")
    print("Test Error: {}".format(round(testErr, 4)))
Example #4
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/AmountVectors2/')
# Split the data into training and test sets
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()
testData.cache()

model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=20, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=3, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() /\
    float(testData.count())

print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())
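
The RandomForestModel import above is otherwise unused; presumably it is there to reload a saved forest. A minimal sketch, with a hypothetical GCS path:

# Persist the trained forest and reload it (the path is an assumption).
model.save(sc, "gs://cs123data/Models/AmountForest")
sameModel = RandomForestModel.load(sc, "gs://cs123data/Models/AmountForest")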


Example #5
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/PartyVectors/')
# Split the data into training and test sets
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()

# The depth of the tree proved to be a significant bottleneck
model = RandomForest.trainClassifier(trainingData, numClasses=4, categoricalFeaturesInfo={},
                                     numTrees=700, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=8, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print("")
print("")
print('Test Error: ' + str(testErr))
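
A single error rate hides which of the four classes the forest confuses. pyspark.mllib's MulticlassMetrics can break the result down; a minimal sketch, assuming the labelsAndPredictions RDD from above:

from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects (prediction, label) pairs, so swap the tuple order.
metrics = MulticlassMetrics(labelsAndPredictions.map(lambda vp: (vp[1], vp[0])))
print(metrics.confusionMatrix().toArray())
for label in [0.0, 1.0, 2.0, 3.0]:
    print(label, metrics.precision(label), metrics.recall(label))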
Example #6
from __future__ import print_function

import sys
import math
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":

    sc = SparkContext(appName="BloodTestReportPythonSVMExample")

    # Load the data.
    print('Begin Load Data File!')
    sexData = MLUtils.loadLabeledPoints(sc, "LabeledPointsdata_sex.txt")
    print('Data File has been Loaded!')

    accuracySex = []

    for _ in range(100):
        # Randomly split the data 9:1 into a training set and a test set.
        sexTraining, sexTest = sexData.randomSplit([0.9, 0.1])

        # Train a linear SVM model.
        # print('Begin SVM training!')
        sexModel = SVMWithSGD.train(sexTraining, iterations=100)
        # print('Training over!')

        # Predict on the test set and record this split's accuracy.
        sexPredictionAndLabel = sexTest.map(lambda p: (sexModel.predict(p.features), p.label))
        accuracySex.append(1.0 * sexPredictionAndLabel.filter(lambda xv: xv[0] == xv[1]).count() / sexTest.count())
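
The excerpt stops before the 100 collected accuracies are reported; a one-line summary like the following is a natural next step (an addition, not part of the original):

# Hypothetical summary over the 100 random splits.
print('Mean SVM accuracy: %f' % (sum(accuracySex) / len(accuracySex)))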