def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins,
                        numClasses=4):
    """Train a decision tree classifier and report its error rates.

    The model is fitted on ``trainingSet`` and then evaluated on both the
    training and the test data; each evaluation is handed to
    ``eva.calculateErrorRate`` together with a descriptive title.

    Args:
        trainingSet: Distributed dataset used to fit the model. Every record
            must expose ``label`` and ``features`` attributes (presumably
            ``LabeledPoint`` instances -- confirm against the caller).
        testSet: Dataset with the same record layout, used for evaluation
            only.
        imp: Impurity criterion forwarded to ``DecisionTree.trainClassifier``.
        dtMaxDepth: Maximum tree depth forwarded to the trainer.
        dtMaxBins: Maximum number of bins forwarded to the trainer.
        numClasses: Number of target classes forwarded to the trainer.
            Defaults to 4, the value that was previously hard-coded, so
            existing callers keep their behavior.

    Returns:
        The model object returned by ``DecisionTree.trainClassifier``.
    """
    decisionTreeModel = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=numClasses,
        categoricalFeaturesInfo={},
        impurity=imp,
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins,
    )

    # Evaluate on the training data first, then on the test data, using the
    # same predict -> zip -> report sequence for both.
    evaluations = (
        ("\nClassification model Training set", trainingSet),
        ("\nClassification model Test set", testSet),
    )
    for title, dataSet in evaluations:
        predictions = decisionTreeModel.predict(
            dataSet.map(lambda item: item.features))
        # Pair every true label with the prediction made for the same record.
        labelsAndPredictions = dataSet.map(
            lambda item: item.label).zip(predictions)
        eva.calculateErrorRate(title, labelsAndPredictions)

    return decisionTreeModel
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile,
            markedClusterFile):
    """Build a cluster-labelled dataset from two text files and fit a tree.

    Each line of ``eigenVecFile`` is converted with ``removeVirtualPart`` and
    each line of ``markedClusterFile`` with ``extractClusterID``. The two
    sequences are paired positionally into ``LabeledPoint(clusterId, vector)``
    records, split 70/30 into training and test sets, and an entropy-based
    decision tree classifier is trained on the training part. Error rates for
    both parts are passed to ``eva.calculateErrorRate``.

    Args:
        sc: Spark context used to read the files and build the datasets.
        dtClusterNum: Number of classes forwarded to the trainer.
        dtMaxDepth: Maximum tree depth forwarded to the trainer.
        dtMaxBins: Maximum number of bins forwarded to the trainer.
        eigenVecFile: Path of the text file holding the feature vectors.
        markedClusterFile: Path of the text file holding the cluster labels.

    Returns:
        The model object returned by ``DecisionTree.trainClassifier``.
    """
    # Both files are materialised on the driver and redistributed before
    # zipping.
    # NOTE(review): presumably this guarantees identical partitioning for
    # RDD.zip -- confirm; it also means both files must fit in driver memory.
    featureRows = sc.textFile(eigenVecFile).map(removeVirtualPart).collect()
    labelRows = sc.textFile(markedClusterFile).map(extractClusterID).collect()

    labelRdd = sc.parallelize(labelRows)
    featureRdd = sc.parallelize(featureRows)
    labeledPoints = labelRdd.zip(featureRdd).map(
        lambda pair: LabeledPoint(pair[0], pair[1]))

    trainingSet, testSet = labeledPoints.randomSplit([0.7, 0.3])

    model = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=dtClusterNum,
        categoricalFeaturesInfo={},
        impurity='entropy',
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins,
    )

    # Training data is evaluated first, then the held-out test data.
    for title, dataSet in (("\nCluster model Training set", trainingSet),
                           ("\nCluster model Test set", testSet)):
        predicted = model.predict(dataSet.map(lambda point: point.features))
        # Pair every true label with the prediction made for the same record.
        actualVsPredicted = dataSet.map(
            lambda point: point.label).zip(predicted)
        eva.calculateErrorRate(title, actualVsPredicted)

    return model