Exemplo n.º 1
0
# Turn every raw input line into a LabeledPoint. `lines`, `extract_label` and
# `extract_feature` are defined outside this excerpt; the label is read from
# the whole line and the features from the line with its first three
# elements dropped (`line[3:]`).
labeldPointRDD = lines.map(
    lambda line: LabeledPoint(extract_label(line), extract_feature(line[3:]))
)

# Split into training / validation / test sets. randomSplit normalises the
# weights, so [8, 1, 1] gives roughly 80% / 10% / 10%. No seed is passed, so
# the split differs from run to run.
trainData, validateData, testData = labeldPointRDD.randomSplit([8, 1, 1])

# Keep the splits in memory to speed up the program. persist() is lazy, so it
# has to be requested BEFORE the first action: that way the count() below
# fills the cache instead of computing the lineage once for the count and
# again for training.
trainData.persist()
validateData.persist()
testData.persist()

print("train data:", trainData.count())

# 决策树训练模型
from pyspark.mllib.tree import DecisionTree

# Train a binary decision-tree classifier on the training split.
# An empty categoricalFeaturesInfo means every feature is treated as
# continuous.
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity="entropy",
    maxDepth=5,
    maxBins=5,
)

# Human-readable names for the predicted class labels. predict() returns
# floats (0.0 / 1.0); they hash and compare equal to the int keys used here.
# NOTE(review): the spelling "negetive" is runtime output and is kept as-is;
# also confirm that label 0 really means "positive" in extract_label.
descDict = {0: "positive", 1: "negetive"}

# An RDD cannot be iterated directly on the driver, so stream its rows back
# with toLocalIterator() (one partition at a time, rather than collect()-ing
# the whole test set at once). Each row is a LabeledPoint; the model is fed
# its feature vector.
for data in testData.toLocalIterator():
    predictResult = model.predict(data.features)
    print("predict label", descDict[predictResult])
	

#评估模型
def auc(data):
    """Return the area under the ROC curve of the global ``model`` on ``data``.

    Args:
        data: RDD of ``LabeledPoint`` rows to evaluate on (for example the
            validation or test split).

    Returns:
        The ``areaUnderROC`` value reported by ``BinaryClassificationMetrics``.

    Note:
        The module-level ``model`` is used for prediction. A decision-tree
        classifier's ``predict`` yields the predicted class label rather than
        a continuous score, so the resulting curve is built from those hard
        predictions.
    """
    from pyspark.mllib.evaluation import BinaryClassificationMetrics

    # Predict on the whole RDD at once: model.predict must be called on the
    # RDD itself, not from inside an RDD transformation.
    predict = model.predict(data.map(lambda x: x.features))
    # Pair each prediction with its true label; both RDDs derive from `data`
    # through a plain map, so they line up element by element.
    scoreAndLabel = predict.zip(data.map(lambda x: x.label))
    metrics = BinaryClassificationMetrics(scoreAndLabel)
    return metrics.areaUnderROC