import numpy as np

def evaluate_dt(train, test, maxDepth, maxBins):
    # Train a regression tree and return the RMSLE on the test set.
    # squared_log_error is assumed to be defined elsewhere in this codebase.
    model = DecisionTree.trainRegressor(train, {}, impurity='variance',
                                        maxDepth=maxDepth, maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    return rmsle
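The squared_log_error helper assumed above is defined elsewhere in this collection; a minimal version, assuming numpy is imported as np:

import numpy as np

def squared_log_error(pred, actual):
    # Squared difference of log-transformed values, the per-point term of RMSLE.
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2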
def trainClassifier(self):
    # get the current time
    current = time()
    # get the tags
    tags = self.tags
    numeric = self.numeric
    x = self.x
    y = self.y
    # get the training data
    training_data = self.training_labeled
    # start training the tree model
    self.tree_model = DecisionTree.trainClassifier(
        training_data, numClasses=4,
        categoricalFeaturesInfo={0: len(tags), 1: len(numeric), 2: len(x), 3: len(y)},
        impurity="gini", maxDepth=5, maxBins=1000)
    print self.tree_model
    # total time
    total = time() - current
    print "Classifier trained in {} seconds.".format(round(total, 3))
    # start evaluating the model
    self.evaluate()
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data. Note: weights [1.0, 0.0] put everything in trainSet,
    # leaving validationSet empty.
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the model
    decisionTreeModel = DecisionTree.trainClassifier(
        trainSet, numClasses=5, categoricalFeaturesInfo={},
        impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
def generateDecisionTree():
    if os.path.exists(DT_PATH):
        print("DT_PATH already exists")
        return
    global model
    data = sc.textFile(F_PATH).map(parseLine)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1L)
    model = DecisionTree.trainClassifier(trainingData, numClasses=len(classes),
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    modelStatistics(labelsAndPredictions)
    # Save the model
    model.save(sc, DT_PATH)
    print("Decision Tree model saved!")
def decisionTree(trainingRDD, trainingRDDHashed, testRDDHashed, testRDD):
    # Get size of RDDs
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Train the Decision Tree model
    trainedModel = DecisionTree.trainClassifier(
        trainingRDD, numClasses=2, categoricalFeaturesInfo={},
        impurity='gini', maxDepth=2, maxBins=3)
    # Test the model on the training set
    predictions = trainedModel.predict(trainingRDD.map(lambda x: x.features))
    labelsAndPredictions = trainingRDD.map(
        lambda lp: lp.label).zip(predictions).countByValue()
    # Map to dictionary for obtaining results
    resultsValidation = defaultdict(lambda: 0, labelsAndPredictions)
    # Get accuracy and F-score on the training set
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    # Test the model on the test set
    predictions = trainedModel.predict(testRDD.map(lambda x: x.features))
    labelsAndPredictions = testRDD.map(
        lambda lp: lp.label).zip(predictions).countByValue()
    # Map to dictionary for obtaining results
    resultsTest = defaultdict(lambda: 0, labelsAndPredictions)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print results
    print(' Results for Decision Tree')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the result list
    return AccuracyV, fScoreV, AccuracyT, fScoreT
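getAccuracy is not shown in this collection; a plausible sketch of its contract, assuming binary labels and a countByValue dict keyed by (actual, predicted) pairs:

def getAccuracy(results, nFiles):
    # Hypothetical helper: results maps (actual, predicted) pairs to counts.
    tp = results[(1.0, 1.0)]
    tn = results[(0.0, 0.0)]
    fp = results[(0.0, 1.0)]
    fn = results[(1.0, 0.0)]
    accuracy = (tp + tn) / float(nFiles)
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    fscore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, fscore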
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def trainModel(trainingData):
    print '\nTraining Decision Tree model started'
    Utils.logTime()
    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    print '\nTraining Decision Tree model finished'
    Utils.logTime()
    return model
def RunDecisionTree(tf):
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    model = DecisionTree.trainClassifier(train, numClasses=numCat,
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=100)
    predictions = model.predict(train.map(lambda x: x.features))
    labelsAndPredictions = train.map(lambda lp: lp.label).zip(predictions)
    # Divide by the training-set count, since the error is measured on train
    trainErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(train.count())
    print('Training Error = ' + str(trainErr))
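parseAsLabeledPoints is assumed here and in the variant further below; a minimal sketch, assuming comma-separated rows with the class label in the first field:

from pyspark.mllib.regression import LabeledPoint

def parseAsLabeledPoints(line):
    # Hypothetical parser: label first, numeric features after.
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])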
def train(self, num_classes=2, categorical_features=None, max_depth=5):
    categorical_features = categorical_features or {}
    model = DecisionTree.trainClassifier(
        self._labeled_feature_vector_rdd(),
        numClasses=num_classes,
        categoricalFeaturesInfo=categorical_features,
        maxDepth=max_depth)
    return DecisionTreeModel(model, self.feature_cols)
def RunDecisionTree(tf):
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    numCat = len(genCats)
    model = DecisionTree.trainClassifier(train, numClasses=numCat,
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=100)
    # Evaluate the model on the held-out test set and compute the test error
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())
    print('Accuracy of decision tree = ' + str(1 - testErr))
    print('Test Error = ' + str(testErr))
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def regression(sc, sample):
    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x: LabeledPoint(x[1], x[0]))
    testdata = [8.2]
    #####
    # linear_model = LinearRegressionWithSGD.train(traindata, iterations=10)
    # prediction = linear_model.predict(testdata)
    # print prediction
    #####
    decision_model = DecisionTree.trainRegressor(traindata, {})
    prediction = decision_model.predict(testdata)
    print prediction
def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins):
    decisionTreeModel = DecisionTree.trainClassifier(trainingSet, numClasses=4,
                                                     categoricalFeaturesInfo={},
                                                     impurity=imp, maxDepth=dtMaxDepth,
                                                     maxBins=dtMaxBins)

    predictions = decisionTreeModel.predict(trainingSet.map(lambda item: item.features))
    trainingLabelsAndPredictions = trainingSet.map(lambda item: item.label).zip(predictions)
    eva.calculateErrorRate("\nClassification model Training set", trainingLabelsAndPredictions)

    predictions = decisionTreeModel.predict(testSet.map(lambda item: item.features))
    testLabelsAndPredictions = testSet.map(lambda item: item.label).zip(predictions)
    eva.calculateErrorRate("\nClassification model Test set", testLabelsAndPredictions)

    return decisionTreeModel
def classify(sc, sample):
    def ff(x):
        # Encode each categorical value as 0 (negative), 1 (middle) or 2 (positive)
        newsample = []
        nl = ["rainy", "sad", "lack"]
        ml = ["cloudy", "soso", "enough"]
        pl = ["sunny", "happy", "most"]
        for i in x:
            if i in nl:
                newsample.append(0)
            elif i in ml:
                newsample.append(1)
            elif i in pl:
                newsample.append(2)
        return newsample

    f = lambda x: 1 if x == "yes" else 0
    traindata = sc.parallelize(sample).map(lambda x: (ff(x[0]), f(x[1])))
    traindata = traindata.map(lambda x: LabeledPoint(x[1], x[0]))
    testdata = traindata.first()
    print testdata
    ######
    # print "logistic"
    # lrModel = LogisticRegressionWithSGD.train(traindata, 10)
    # prediction = lrModel.predict(testdata.features)
    # print prediction
    #####
    # print "svm"
    # svmModel = SVMWithSGD.train(traindata, 10)
    # prediction = svmModel.predict(testdata.features)
    # print prediction
    ####
    # print "naive bayes"
    # nbModel = NaiveBayes.train(traindata)
    # prediction = nbModel.predict(testdata.features)
    # print prediction
    ####
    print "decision tree"
    detreeModel = DecisionTree.trainClassifier(traindata, 2, {})
    prediction = detreeModel.predict(testdata.features)
    print prediction
def main(input_file):
    sc = pyspark.SparkContext(appName="DecisionTree")
    data = MLUtils.loadLabeledPoints(sc, input_file)
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Cache in memory for faster training
    trainingData.cache()

    model = DecisionTree.trainClassifier(trainingData, numClasses=4,
                                         impurity='gini',
                                         categoricalFeaturesInfo={},
                                         maxDepth=16, maxBins=10)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())

    # print model.toDebugString()
    print ""
    print ""
    print "Test Error: {}".format(round(testErr, 4))
def trainModel(self, vectSpace, path):
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(vectSpace, numClasses=len(self.category),
                                                 categoricalFeaturesInfo={},
                                                 impurity='gini', maxDepth=5, maxBins=5)

        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)
            os.makedirs(path)

        model.save(self.sc, path)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return model
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile, markedClusterFile):
    filteredEigenVec = sc.textFile(eigenVecFile).map(lambda item: removeVirtualPart(item)).collect()
    clusterIDs = sc.textFile(markedClusterFile).map(lambda item: extractClusterID(item)).collect()
    clusterIdEigenVecMapRDD = sc.parallelize(clusterIDs).zip(sc.parallelize(filteredEigenVec))
    labeledClusterIdEigenVecMapRdd = clusterIdEigenVecMapRDD.map(lambda item: LabeledPoint(item[0], item[1]))

    trainingSet, testSet = labeledClusterIdEigenVecMapRdd.randomSplit([0.7, 0.3])
    decisionTreeModel = DecisionTree.trainClassifier(trainingSet, numClasses=dtClusterNum,
                                                     categoricalFeaturesInfo={},
                                                     impurity='entropy', maxDepth=dtMaxDepth,
                                                     maxBins=dtMaxBins)

    predictions = decisionTreeModel.predict(trainingSet.map(lambda item: item.features))
    trainingLabelsAndPredictions = trainingSet.map(lambda item: item.label).zip(predictions)
    eva.calculateErrorRate("\nCluster model Training set", trainingLabelsAndPredictions)

    predictions = decisionTreeModel.predict(testSet.map(lambda item: item.features))
    testLabelsAndPredictions = testSet.map(lambda item: item.label).zip(predictions)
    eva.calculateErrorRate("\nCluster model Test set", testLabelsAndPredictions)

    return decisionTreeModel
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2,
                                             categoricalFeaturesInfo={}, numTrees=15,
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=5, maxBins=50)
    return model
def trainOptimalModel(trainingData, testData):
    print "\nTraining optimal Decision Tree model started!"
    Utils.logTime()

    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    try:
        for curImpurity in impurityVals:
            for curMaxDepth in maxDepthVals:
                for curMaxBins in maxBinsVals:
                    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                                         categoricalFeaturesInfo={},
                                                         impurity=curImpurity,
                                                         maxDepth=curMaxDepth,
                                                         maxBins=curMaxBins)
                    testErr, PR, ROC = Evaluation.evaluate(model, testData)
                    # Keep the model with the lowest test error seen so far
                    if minError is None or testErr < minError:
                        minError = testErr
                        optimalImpurity = curImpurity
                        optimalMaxDepth = curMaxDepth
                        optimalBinsVal = curMaxBins
                        optimalModel = model
    except:
        msg = "\nException during model training with below parameters:"
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
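Evaluation.evaluate is external to this snippet; a plausible sketch of its (testErr, PR, ROC) contract, assuming areas under the PR and ROC curves come from BinaryClassificationMetrics:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluate(model, testData):
    # Hypothetical helper matching the (testErr, PR, ROC) tuple used above.
    predictions = model.predict(testData.map(lambda p: p.features))
    labels_and_preds = testData.map(lambda p: p.label).zip(predictions)
    testErr = labels_and_preds.filter(lambda (v, p): v != p).count() / float(testData.count())
    # BinaryClassificationMetrics expects (score, label) pairs
    metrics = BinaryClassificationMetrics(labels_and_preds.map(lambda (v, p): (p, v)))
    return testErr, metrics.areaUnderPR, metrics.areaUnderROC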
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                            categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
sc = SparkContext()
result = {1.0: 'yes', 0.0: 'no'}

# Fish-classification dataset from Chapter 3 of "Machine Learning in Action"
data = [
    LabeledPoint(1, [1, 1]),
    LabeledPoint(1, [1, 1]),
    LabeledPoint(0, [1, 0]),
    LabeledPoint(0, [0, 1]),
    LabeledPoint(0, [0, 1])
]
rdd = sc.parallelize(data)
print '------------------------------------'
print type(rdd), dir(rdd)
print rdd.collect()
print '------------------------------------'

model = DecisionTree.trainClassifier(rdd, 3, {})
# print(model)
print '********************************************************'
print(model.toDebugString())
print "test [1,0]: %s" % (result[model.predict(array([1, 0]))])
print "test [1,1]: %s" % (result[model.predict(array([1, 1]))])
print "test [0,0]: %s" % (result[model.predict(array([0, 0]))])
print '********************************************************'
sc.stop()
conf = SparkConf().setAppName(appName).setMaster("local[2]")  # at least 2 cores
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)

# Separate the classification label from the actual data
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

# Train the model
data = sc.textFile(learning_data_file)
parsedData = data.map(parsePoint)
model = (DecisionTree.trainClassifier(parsedData, numClasses=2,
                                      categoricalFeaturesInfo={2: 9},
                                      impurity='gini', maxDepth=30))
"""
model = (RandomForest.trainClassifier(parsedData, numClassesForClassification=2,
                                      numTrees=6, categoricalFeaturesInfo={2: 10},
                                      impurity='gini', maxDepth=30))
"""
print "====================== model trained ======================"

# Streaming and parsing text
lines = ssc.socketTextStream(HOST, QUERY_PORT)
vectors = lines.flatMap(lambda x: x.split(',')).map(lambda l: float(l))
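The snippet stops after parsing the stream; a minimal continuation sketch, assuming fixed-length comma-separated feature rows, where scoring happens per micro-batch on the driver (model.predict accepts a whole RDD, and JVM-backed tree models cannot be called inside worker-side map functions):

def score_batch(rdd):
    # Runs on the driver once per micro-batch.
    if not rdd.isEmpty():
        print model.predict(rdd).collect()

feature_rows = lines.map(lambda x: [float(v) for v in x.split(',')])
feature_rows.foreachRDD(score_batch)

ssc.start()
ssc.awaitTermination()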
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))  ## 0.555395278766

############################ Decision Tree ##############################
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[0:9])

data = sc.textFile("/Users/mac/Desktop/USF/MSAnalytics/Spring1/ML2/ML Project/plays.csv")
header = data.first()
data = data.filter(lambda x: x != header)
parsedData = data.map(parsePoint)

model = DecisionTree.trainClassifier(parsedData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=30, maxBins=100)

# Evaluate model on training instances and compute training error
predictions = model.predict(parsedData.map(lambda x: x.features))
labelsAndPredictions = parsedData.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(parsedData.count())  # 0.09
print('Training Error = ' + str(trainErr))
print('Learned classification tree model:')
print(model)
    row = []
    while column_number < 200:
        if array[m + column_number] == 1:
            row.append(x[column_number])
            index += 1
        column_number += 1
    return LabeledPoint(train_labels_array[j], row)

trainingData = train_data.map(f)
test = test_data.map(g)

# Train model using a decision tree
model = DecisionTree.trainClassifier(trainingData, numClasses=12613,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=2, maxBins=32)
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)

# Obtain the accuracy
test_accuracy = labelsAndPredictions.filter(
    lambda lp: lp[0] == lp[1]).count() / float(test.count())
accuracies.append(test_accuracy)
m += 200

# LAST individual
individual = individual4
z = 0
# Split each line into a list based on the comma delimiters
csvData = rawData.map(lambda x: x.split(","))

# Convert these lists to LabeledPoints
trainingData = csvData.map(createLabeledPoints)

# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Train our DecisionTree classifier using our data set
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
    print result

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())
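createLabeledPoints is defined elsewhere in the script this came from; a plausible sketch consistent with the 6-feature candidate vector above (the exact field layout and encodings are assumptions):

from numpy import array
from pyspark.mllib.regression import LabeledPoint

def binary(yn):
    return 1 if yn == 'Y' else 0

def createLabeledPoints(fields):
    # Assumed column layout: years of experience, employed?, previous employers,
    # education level, top-tier school?, internship?, hired?
    yearsExperience = int(fields[0])
    employed = binary(fields[1])
    previousEmployers = int(fields[2])
    educationLevel = int(fields[3])
    topTier = binary(fields[4])
    interned = binary(fields[5])
    hired = binary(fields[6])
    return LabeledPoint(hired, array([yearsExperience, employed, previousEmployers,
                                      educationLevel, topTier, interned]))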
def test_all(self, measure_columns=None, dimension_columns=None):
    measures = measure_columns
    if measure_columns is None:
        measures = self._measure_columns
    dimension = dimension_columns[0]
    all_dimensions = self._dimension_columns
    all_measures = self._measure_columns
    cat_feature_info = []
    columns_without_dimension = list(x for x in all_dimensions if x != dimension)
    mapping_dict = {}
    masterMappingDict = {}
    decision_tree_result = DecisionTreeResult()

    for column in all_dimensions:
        mapping_dict[column] = dict(
            enumerate(
                self._data_frame.select(column).distinct().rdd.map(
                    lambda x: str(x[0])).collect()))
    # for c in mapping_dict:
    #     name = c
    #     reverseMap = {v: k for k, v in mapping_dict[c].iteritems()}
    #     udf = UserDefinedFunction(lambda x: reverseMap[x], StringType())
    #     self._data_frame = self._data_frame.select(*[udf(column).alias(name) if column == name else column for column in self._data_frame.columns])

    # Convert the Spark dataframe to pandas for the transformation, then back to Spark
    pandasDataFrame = self._data_frame.toPandas()
    for key in mapping_dict:
        pandasDataFrame[key] = pandasDataFrame[key].apply(
            lambda x: 'None' if x is None else x)
        reverseMap = {v: k for k, v in mapping_dict[key].items()}
        pandasDataFrame[key] = pandasDataFrame[key].apply(
            lambda x: reverseMap[x])
    # sqlCtx = SQLContext(self._spark)
    self._data_frame = self._spark.createDataFrame(pandasDataFrame)
    self._mapping_dict = mapping_dict

    for c in columns_without_dimension:
        cat_feature_info.append(
            self._data_frame.select(c).distinct().count())
    if len(cat_feature_info) > 0:
        max_length = max(cat_feature_info)
    else:
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))
    dimension_classes = self._data_frame.select(dimension).distinct().count()
    self._data_frame = self._data_frame[[dimension] + columns_without_dimension + all_measures]
    data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
    (trainingData, testData) = data.randomSplit([1.0, 0.0])
    # TO DO: set maxBins at least equal to the max level of categories in the dimension column
    model = DecisionTree.trainClassifier(
        trainingData, numClasses=dimension_classes,
        categoricalFeaturesInfo=cat_feature_info,
        impurity='gini', maxDepth=3, maxBins=max_length)
    output_result = model.toDebugString()
    decision_tree = self.tree_json(output_result, self._data_frame)
    self.generate_probabilities(decision_tree, dimension)
    # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
    # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
    decision_tree_result.set_params(decision_tree, self._new_rules, self._total,
                                    self._success, self._probability)
    return decision_tree_result
print "Decision Tree feature vector length: " + str( len(first_point_tree.features)) # In[167]: from pyspark.mllib.tree import DecisionTree #from the RDD sample 20% for training and rest for test records_tree_with_idx = data_tree.zipWithIndex().map(lambda (k, v): (v, k)) test_tree_idx = records_tree_with_idx.sample(False, 0.2, 42) training_tree_idx = records_tree_with_idx.subtractByKey(test_tree_idx) test_tree = test_tree_idx.map(lambda (idx, p): p) training_tree = training_tree_idx.map(lambda (idx, p): p) model_tree = DecisionTree.trainRegressor(training_tree, {}) preds_tree = model_tree.predict(test_tree.map(lambda p: p.features)) actual_tree = test_tree.map(lambda p: p.label) true_vs_predicted_tree = actual_tree.zip(preds_tree) print "Decision Tree predictions: " + str(true_vs_predicted_tree.take(5)) print "Decision Tree depth: " + str(model_tree.depth()) print "Decision Tree number of nodes: " + str(model_tree.numNodes()) # In[177]: mse_tree = true_vs_predicted_tree.map(lambda (t, p): squared_error(t, p)).mean() mae_tree = true_vs_predicted_tree.map(lambda (t, p): abs_error(t, p)).mean()
sc = SparkContext(appName="PythonDecisionTreeClassificationExample")
# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'carbon2.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Features 0 and 1 are treated as categorical, with 5 levels each.
model = DecisionTree.trainClassifier(trainingData, numClasses=5,
                                     categoricalFeaturesInfo={0: 5, 1: 5},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
def test_all(self, measure_columns=None, dimension_columns=None):
    measures = measure_columns
    if measure_columns is None:
        measures = self._measure_columns
    self._target_dimension = dimension_columns[0]
    dimension = self._target_dimension
    ##### Look into it for Issue 947 #################
    max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
    # max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
    # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = self._measure_columns
    if self._pandas_flag:
        self._data_frame = self._data_frame[all_dimensions + all_measures]
    cat_feature_info = []
    columns_without_dimension = [x for x in all_dimensions if x != dimension]
    mapping_dict = {}
    masterMappingDict = {}
    decision_tree_result = DecisionTreeResult()
    decision_tree_result.set_freq_distribution(
        self._metaParser.get_unique_level_dict(self._target_dimension),
        self._important_vars)
    if self._pandas_flag:
        try:
            all_dimensions.remove(dimension)
        except:
            pass
        actual_cols = list(self._data_frame.columns)
        print(actual_cols)
        self._data_frame = pd.get_dummies(self._data_frame, columns=all_dimensions)
        after_dummy_cols = list(self._data_frame.columns)

        def Diff(li1, li2):
            return list(list(set(li1) - set(li2)) + list(set(li2) - set(li1)))

        decision_tree_result.dummy_cols = [
            Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
            all_dimensions
        ]
        all_dimensions.append(dimension)  # this has been done for scoring error
    if self._pandas_flag:
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, [dimension], self._pandas_flag)
    else:
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, all_dimensions, self._pandas_flag)
    if self._pandas_flag:
        print(self._data_frame.head(1))
    else:
        print(self._data_frame.show(1))
    # standard_measure_index = {0.0: 'Low', 1.0: 'Medium', 2.0: 'High'}
    standard_measure_index = {
        0.0: 'Low',
        1.0: 'Below Average',
        2.0: 'Average',
        3.0: 'Above Average',
        4.0: 'High'
    }
    for measure in all_measures:
        mapping_dict[measure] = standard_measure_index
    for k, v in list(mapping_dict.items()):
        temp = {}
        for k1, v1 in list(v.items()):
            self._alias_dict[v1.replace(",", "")] = v1
            temp[k1] = v1.replace(",", "")
        mapping_dict[k] = temp
    self._mapping_dict = mapping_dict
    if not self._pandas_flag:
        for c in columns_without_dimension:
            if self._pandas_flag:
                cat_feature_info.append(len(self._data_frame[c].unique()))
            else:
                cat_feature_info.append(
                    self._data_frame.select(c).distinct().count())
        for c in all_measures:
            cat_feature_info.append(5)
        columns_without_dimension = columns_without_dimension + all_measures
        all_measures = []
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
    else:
        decision_tree_result.mappingdict = mapping_dict[dimension]
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))
    if self._pandas_flag:
        dimension_classes = len(self._data_frame[dimension].unique())
    else:
        dimension_classes = self._data_frame.select(dimension).distinct().count()
    if not self._pandas_flag:
        self._data_frame = self._data_frame[[dimension] + columns_without_dimension + all_measures]
    print("=" * 200)
    # print self._data_frame.rdd.first()
    print("numClasses", dimension_classes)
    print("maxDepth", self._maxDepth)
    decision_tree_result._maxDepth = self._maxDepth
    print("maxBins", max_length)
    print("=" * 200)
    if self._pandas_flag:
        self._data_frame.columns = [
            re.sub(r'\W+', '_', col.strip()) for col in self._data_frame.columns
        ]
        x = self._data_frame.drop(dimension, axis=1)
        y = self._data_frame[dimension]
        # tle = LabelEncoder()
        # y = tle.fit_transform(y)
        for i in x.columns:
            x[i] = x[i].fillna(x[i].mode()[0])
        model = DecisionTreeClassifier(criterion='gini', max_depth=self._maxDepth,
                                       random_state=42)
        model = model.fit(x, y)
        output_result = self.tree_to_code(model, list(x.columns))
        output_result = list(map(lambda s: s.strip(), output_result))
    else:
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO: set maxBins at least equal to the max level of categories in the dimension column
        # Dropping categoricalFeaturesInfo would treat every feature as continuous,
        # which produced wrong prediction rules (e.g. "yes"/"no" columns compared
        # against 0.5), so categoricalFeaturesInfo=cat_feature_info is kept.
        model = DecisionTree.trainClassifier(
            trainingData, numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini', maxDepth=self._maxDepth, maxBins=max_length)
        output_result = model.toDebugString()
    decision_tree = self.tree_json(output_result, self._data_frame, self._pandas_flag)
    self._new_tree = self.generate_new_tree(decision_tree)
    node_list = self.node_name_extractor(self._new_tree)
    node_list = list(self.flatten(node_list))
    correct_count_list = [i[0] for i in self._count_list]
    tree_dict = dict(list(zip(node_list, correct_count_list)))
    self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
    self._path_dict = self.path_dict_creator(node_list, self._new_tree)
    print("===" * 40)
    decision_tree_result.set_params(self._new_tree, self._new_rules, self._total,
                                    self._success, self._probability, self._path_dict)
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["script"] *
        self._scriptStages["treegeneration"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "treegeneration",
        "info",
        self._scriptStages["treegeneration"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL, progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(self._completionStatus)
    return decision_tree_result
# Append labels
appendColumn(ensemble_test, rf_test_predict_label)
appendColumn(ensemble_train, rf_train_predict_label)

# Decision Trees
# C13 - C21
# Build one model per maximum depth
max_depth = [5, 10, 15, 20]
for i in range(0, len(max_depth), 1):
    m_depth = max_depth[i]
    # Build the model
    model = DecisionTree.trainClassifier(train_data, 10, {}, impurity='gini',
                                         maxDepth=m_depth)
    rf_train_predict_label = []
    rf_test_predict_label = []
    # Predict labels
    for j in range(0, len(test_features), 1):
        p_l = model.predict(test_features[j])
        rf_test_predict_label.extend([p_l])
    for j in range(0, len(train_features), 1):
        p_l = model.predict(train_features[j])
        rf_train_predict_label.extend([p_l])
    # Append labels
#exec(open("./doweathclass_dectree.py").read()) # ---------------- now try decision tree ------------ from pyspark.mllib.tree import DecisionTree dt_model = DecisionTree.trainClassifier(datax_rdd, 2, {}, impurity='entropy', maxDepth=3, maxBins=32, minInstancesPerNode=2) #maxDepth and maxBins #{} could be categorical feature list, # To do regression, have no numclasses,and use trainRegression function print(dt_model.toDebugString()) #results in this: #DecisionTreeModel classifier of depth 3 with 9 nodes # If (feature 1 <= 0.0) # If (feature 4 <= 80.0) # If (feature 3 <= 68.0) # Predict: 0.0 # Else (feature 3 > 68.0) # Predict: 1.0 # Else (feature 4 > 80.0) # If (feature 0 <= 0.0) # Predict: 0.0 # Else (feature 0 > 0.0) # Predict: 0.0 # Else (feature 1 > 0.0) # Predict: 1.0
# In[ ]:

# In[53]:

(trainingData, testData) = fdata.randomSplit([0.8, 0.2])

# Use the decision tree classifier to train the model

# In[54]:

from pyspark.mllib.tree import DecisionTree

# In[55]:

model = DecisionTree.trainClassifier(trainingData, numClasses=3,
                                     categoricalFeaturesInfo={})

# In[56]:

predictions = model.predict(testData.map(lambda row: row.features))

# Create a confusion matrix to evaluate the accuracy of the model.
# We create a matrix containing the test labels as the first column (real values)
# and the predicted values as the second column.

# In[57]:

predictionsAndLabels = testData.map(
    lambda labeledpoint: labeledpoint.label).zip(predictions)
# Load data.
dataPath = 'train_svm'  # 'data/mllib/sample_libsvm_data.txt'
if len(sys.argv) == 2:
    dataPath = sys.argv[1]
if not os.path.isfile(dataPath):
    sc.stop()
    usage()
points = MLUtils.loadLibSVMFile(sc, dataPath)

# Re-index class labels if needed.
(reindexedData, origToNewLabels) = reindexClassLabels(points)
numClasses = len(origToNewLabels)

# Train a classifier.
categoricalFeaturesInfo = {}  # no categorical features
model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
                                     categoricalFeaturesInfo=categoricalFeaturesInfo)

# Print learned tree and stats.
print origToNewLabels
print "Trained DecisionTree for classification:"
print "  Model numNodes: %d" % model.numNodes()
print "  Model depth: %d" % model.depth()
print "  Training accuracy: %g" % getAccuracy(model, reindexedData)
if model.numNodes() < 20:
    print model.toDebugString()
else:
    print model

# testdata = MLUtils.loadLibSVMFile(sc, 'test_svm2')
data = numpy.genfromtxt('test_svm2', delimiter=',')  # reuben
rdd = sc.parallelize(data)
model.predict(rdd).collect()
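reindexClassLabels comes from the original example script; a plausible sketch that remaps arbitrary labels onto a dense 0..k-1 range, assuming LabeledPoint is imported:

def reindexClassLabels(data):
    # Hypothetical re-implementation: collect the distinct labels and
    # remap each onto a sequential index so numClasses = len(origToNewLabels).
    origLabels = sorted(data.map(lambda lp: lp.label).distinct().collect())
    origToNewLabels = dict([(orig, float(new)) for (new, orig) in enumerate(origLabels)])
    reindexed = data.map(lambda lp: LabeledPoint(origToNewLabels[lp.label], lp.features))
    return (reindexed, origToNewLabels)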
def train_validate_test_rpart():
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")

        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()))  # anom.count() = 49542, benign.count() = 197406

        # Use a float ratio so the sampling fraction is not truncated to 0 by integer division
        sample_from_benign = benign.sample(False, 50000.0 / n_benign)
        pat_proc = anom.unionAll(sample_from_benign)
        print("pat_proc.count() = " + str(pat_proc.count()))  # 99,227

        all_columns = pat_proc.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        # We list only these 3 as categorical features since the procedure features have 0-1 values anyway
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        # Construct the map categoricalFeaturesInfo, which specifies which features are
        # categorical and how many categorical values each of those features can take.
        # Create a dictionary where the key-value pairs are as follows: key is the name
        # of the categorical feature, and value is a list with the following entries:
        # 1) an id of the feature that is incremented sequentially, 2) no. of distinct
        # values of the feature, 3) a list of the distinct values of the feature.
        cat_feature_number = 0
        dict_cat_features = {}

        for feature in categorical_features:
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            # collect() is an action that returns all the elements of the dataset as an
            # array at the driver program. Calls to collect() imply communication between
            # the executors and the driver, so use it with discretion.
            distinct_values = map(lambda row: row.asDict().values()[0], agvalues)
            distinct_values = sorted(map(lambda unicode_val: unicode_val.encode('ascii', 'ignore'), distinct_values))
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
            cat_feature_number += 1

        pat_proc = pat_proc.rdd
        print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions()))  # 4 partitions: the default should be the number of logical cores, which is 8

        (train, test) = pat_proc.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))

        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        # Populate the actual categoricalFeaturesInfo dictionary
        cat_features_info = dict([(value[0], value[1]) for (key, value) in dict_cat_features.iteritems()])
        procedure_features_info = dict([(feature_id, 2) for feature_id in range(3, 2 + len(procedure_features))])
        cat_features_info = dict(cat_features_info.items() + procedure_features_info.items())

        t0 = time()
        model = DecisionTree.trainClassifier(training_data, numClasses=2,
                                             categoricalFeaturesInfo=cat_features_info,
                                             impurity='gini', maxDepth=2, maxBins=32)
        # Under the hood in DecisionTree.scala, RandomForest is called with
        # numTrees = 1 and featureSubsetStrategy = "all".
        tt = time() - t0
        print "Classifier trained in {} seconds".format(round(tt, 3))
        # 63.355 seconds (5.5 times compared to standalone R). Even when maxDepth was
        # reduced from 5 to 2, time to train was 61.942 seconds.
        print(model)

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print "Prediction made in {} seconds".format(round(tt, 3))  # 0.014 seconds

        # Create a list of tuples with each tuple having the actual and the predicted label
        labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
        test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data_size)
        # float() keeps the rates from being truncated by integer division
        fpr = labels_and_preds.filter(lambda (v, p): (v == 0 and p == 1)).count() / float(labels_and_preds.filter(lambda (v, p): v == 0).count())
        fnr = labels_and_preds.filter(lambda (v, p): (v == 1 and p == 0)).count() / float(labels_and_preds.filter(lambda (v, p): v == 1).count())
        print "Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))
        # With maxDepth = 5, test accuracy is 0.9084, fpr is 0.1555, fnr is 0.0272.
        # With maxDepth = 2, test accuracy is 0.861, fpr is 0.2591, fnr is 0.018.

        print model.toDebugString()
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return model
# print('\n== ACCURACY BAYES : ', accuracy_bayes, '==')
#
# file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
# file.write('\n-> ACCURACY BAYES : ' + str(accuracy_bayes) + '\n')
#
print("\n===================================================== ")
print("=================== DECISION TREE =================== ")
print("===================== (Entropy) ===================== ")
print("=====================================================\n")

print("\n=================== Training ================== \n")
model_decision_tree_entropy = DecisionTree.trainClassifier(
    training, categoricalFeaturesInfo={}, impurity="entropy",
    maxDepth=5, numClasses=2)
print("Done : DT entropy training")

print("\n========= Test on Brexit labeled data ========= ")
# Decision tree with entropy impurity
labeled_prediction_entropy = test_tlabels_brexit.zip(
    model_decision_tree_entropy.predict(tfidf_test_brexit)).map(
        lambda x: {
            "actual": x[0],
            "predicted": x[1]
        })
accuracy_entropy = 1.0 * labeled_prediction_entropy.filter(
    lambda doc: doc["actual"] == doc['predicted']).count() / labeled_prediction_entropy.count()
    attack = 0.0
    # Need at least 10 fields before indexing position 9
    if len(line_split) > 9 and line_split[9] == 'title':
        attack = 1.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

# Build the model
t0 = time()
tree_model = DecisionTree.trainClassifier(
    training_data, numClasses=2,
    categoricalFeaturesInfo={0: len(protocols)},
    impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(
    test_data.count())
tt = time() - t0
# Get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1, 42)
train = data_with_idx.subtractByKey(test)

train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p): p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# Make the decision tree model
dt_model = DecisionTree.trainRegressor(train_data, {})

# Make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

def squared_error(actual, pred):
    return (pred - actual) ** 2

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2
help(LinearRegressionWithSGD.train)
help(DecisionTree.trainRegressor)

# ## Train a Regression Model on the Bike Sharing Dataset
linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print("Linear Model predictions: " + str(true_vs_predicted.take(5)))

# We pass in an empty mapping {} for categorical feature sizes
dt_model = DecisionTree.trainRegressor(data_dt, {})  # an error occurs here...
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))

# Now compare the performance of linear regression and the decision tree
# via their squared residuals.

# ## Performance Metrics
# Set up performance metrics functions
def squared_error(actual, pred):
    return (pred - actual) ** 2
    if fields[6] == "Y":
        hired = 1
    else:
        hired = 0
    return LabeledPoint(hired, [years_of_exp, employed, previousEmployers,
                                education_level, top_tier_school, internship])

path = '/home/sejal/Documents/datascience/dataset/data/emp/candidates_hired_past.csv'
r1 = sc.textFile(path)
r2 = r1.map(lambda entry: entry.split(','))
training_data = r2.map(prepare_data_for_DT)

test_data = [10, 1, 2, 2, 1, 0]
model = DecisionTree.trainClassifier(training_data, numClasses=2,
                                     categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2})

predictions = model.predict(test_data)
print("Hire OR No-Hire")
print(predictions)
print(model.toDebugString())
# results = predictions.collect()
# for result in results:
#     print result
pd.DataFrame(dfd.take(5), columns=dfd.columns).transpose()

def labelData(data):
    return data.map(lambda row: LabeledPoint(row[9], [
        row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7],
        row[8], row[10], row[11], row[12], row[13], row[14], row[15]
    ]))

trainData, testData = labelData(dfd).randomSplit([0.8, 0.2])

model = DecisionTree.trainClassifier(trainData, numClasses=4, maxDepth=6,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxBins=50)
print model.toDebugString()

def getPredictionLabels(model, testData):
    predictions = model.predict(testData.map(lambda r: r.features))
    return predictions.zip(testData.map(lambda r: r.label))

def printMetrics(pred_and_label):
    metrics = MulticlassMetrics(pred_and_label)
    print 'Precision of 1', metrics.precision(1)
    print 'Precision of 2', metrics.precision(2)
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

t0 = time()
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2,
                                          categoricalFeaturesInfo={
                                              1: len(protocols),
                                              2: len(services),
                                              3: len(flags)
                                          },
                                          maxDepth=4, maxBins=100)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

t0 = time()
predictions = tree_model.predict(test_data.map(lambda x: x.features))
labels_and_preds = test_data.map(lambda x: x.label).zip(predictions)
test_accuracy = labels_and_preds.filter(
    lambda x: x[0] == x[1]).count() / float(test_data.count())
tt = time() - t0
print("Prediction made in {} seconds. Test accuracy is {}".format(
    round(tt, 3), round(test_accuracy, 3)))
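protocols, services and flags are built earlier in the script this came from (the classic KDD Cup 99 example); a plausible reconstruction that collects the distinct symbolic values so each category can be encoded by its list index:

# Sketch, assuming csv_data is an RDD of already-split KDD Cup 99 rows.
protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()
# Inside create_labeled_point, each symbolic field is then replaced by
# protocols.index(...), services.index(...) and flags.index(...) respectively.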
print 'Decision tree feature vector length: ' + str(len(first_point_dt.features))

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

help(LinearRegressionWithSGD.train)

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(
    lambda point: (point.label, linear_model.predict(point.features)))
print 'Linear regression predictions for the first 5 samples: ' + str(true_vs_predicted.take(5))

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print 'Decision tree predictions for the first 5 samples: ' + str(true_vs_predicted_dt.take(5))
print 'Decision tree depth: ' + str(dt_model.depth())
print 'Decision tree number of nodes: ' + str(dt_model.numNodes())

def squared_error(actual, pred):
    return (pred - actual) ** 2

def abs_error(actual, pred):
    return np.abs(pred - actual)
def main():
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instance", "3"))
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    # fetch data
    # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    # fetchDataToFile(hc, filepath)

    # load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map(lambda _: {'label': int(_.status), 'feature': extractFeature(_)}) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

    # standardize train and test data
    model = StandardScaler(True, True) \
        .fit(AllDataRawrdd
             .map(lambda _: Vectors.dense(_['feature'])))
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda _: _['feature']))
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map(lambda _: {'label': _[0], 'feature': _[1]})

    # sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(lambda _: LabeledPoint(_['label'], _['feature'])).persist()
    testDatardd = testDataRawrdd.map(lambda _: {'label': _['label'], 'feature': list(_['feature'])}).persist()

    # prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)

    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)

    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
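test, fone and accuracy are helpers defined elsewhere in that script; a plausible sketch consistent with the dict-shaped test records above (the exact signatures are assumptions):

def test(model, testDatardd):
    # Hypothetical helper: score the feature vectors and pair each true label
    # with the model's prediction. predict() is applied to the whole RDD,
    # since JVM-backed tree models cannot be called inside map().
    preds = model.predict(testDatardd.map(lambda _: _['feature']))
    return testDatardd.map(lambda _: _['label']).zip(preds)

def accuracy(resultrdd):
    return resultrdd.filter(lambda (l, p): l == p).count() / float(resultrdd.count())

def fone(resultrdd):
    tp = resultrdd.filter(lambda (l, p): l == 1 and p == 1).count()
    fp = resultrdd.filter(lambda (l, p): l == 0 and p == 1).count()
    fn = resultrdd.filter(lambda (l, p): l == 1 and p == 0).count()
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0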
LR_model = LogisticRegressionWithLBFGS.train(trained_hashed)
LR_prediction_and_labels = check_hashed.map(lambda point: (LR_model.predict(point.features), point.label))
# filter() passes each (predicted, actual) pair as a single tuple argument
LR_correct = LR_prediction_and_labels.filter(lambda pl: pl[0] == pl[1])
LR_accuracy = LR_correct.count() / float(check_hashed.count())
print("LR training accuracy:" + str(LR_accuracy * 100) + " %")
LR_output_dir = 'hdfs://master:9000/user/hadoop/LogisticRegression'
shutil.rmtree("hdfs://master:9000/user/hadoop/LogisticRegression/metadata", ignore_errors=True)
LR_model.save(cc, LR_output_dir)

SVM_model = SVMWithSGD.train(trained_hashed, iterations=10)
SVM_prediction_and_labels = check_hashed.map(lambda point: (SVM_model.predict(point.features), point.label))
SVM_model.clearThreshold()
SVM_correct = SVM_prediction_and_labels.filter(lambda pl: pl[0] == pl[1])
SVM_accuracy = SVM_correct.count() / float(check_hashed.count())
print("SVM training accuracy:" + str(SVM_accuracy * 100) + " %")
SVM_output = 'hdfs://master:9000/user/hadoop/SVM'
shutil.rmtree("hdfs://master:9000/user/hadoop/SVM/metadata", ignore_errors=True)
SVM_model.save(cc, SVM_output)

model = DecisionTree.trainClassifier(trained_hashed, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
predictions = model.predict(check_hashed.map(lambda x: x.features))
labelsAndPredictions = check_hashed.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(check_hashed.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
model.save(cc, "hdfs:///user/hadoop/DT")
# get 90% train and 10% test data data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k)) test = data_with_idx.sample(False, 0.1) train = data_with_idx.subtractByKey(test) train_data = train.map(lambda (idx, p): p) test_data = test.map(lambda (idx, p) : p) train_size = train_data.count() test_size = test_data.count() print "Training data size: %d" % train_size print "Test data size: %d" % test_size print "Total data size: %d " % num_data print "Train + Test size : %d" % (train_size + test_size) # make decision tree model dt_model = DecisionTree.trainRegressor(train_data,{}) # make predictions and measure error preds = dt_model.predict(test_data.map(lambda p: p.features)) actual = test_data.map(lambda p: p.label) true_vs_predicted_dt = actual.zip(preds) print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)) print "Decision Tree depth: " + str(dt_model.depth()) print "Decision Tree number of nodes: " + str(dt_model.numNodes()) def squared_error(actual, pred): return (pred - actual)**2 def squared_log_error(pred, actual): return (np.log(pred + 1) - np.log(actual + 1))**2
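# The two error helpers defined above can be aggregated over the
# (actual, predicted) pairs to summarize the regressor; a short sketch:
mse = true_vs_predicted_dt.map(lambda tp: squared_error(tp[0], tp[1])).mean()
rmsle = np.sqrt(true_vs_predicted_dt.map(lambda tp: squared_log_error(tp[1], tp[0])).mean())
print "Decision Tree - Mean Squared Error: %2.4f" % mse
print "Decision Tree - Root Mean Squared Log Error: %2.4f" % rmsle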
def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \ RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel data = [ LabeledPoint(0.0, [1, 0, 0]), LabeledPoint(1.0, [0, 1, 1]), LabeledPoint(0.0, [2, 0, 0]), LabeledPoint(1.0, [0, 2, 1]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] temp_dir = tempfile.mkdtemp() lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) svm_model = SVMWithSGD.train(rdd, iterations=10) self.assertTrue(svm_model.predict(features[0]) <= 0) self.assertTrue(svm_model.predict(features[1]) > 0) self.assertTrue(svm_model.predict(features[2]) <= 0) self.assertTrue(svm_model.predict(features[3]) > 0) nb_model = NaiveBayes.train(rdd) self.assertTrue(nb_model.predict(features[0]) <= 0) self.assertTrue(nb_model.predict(features[1]) > 0) self.assertTrue(nb_model.predict(features[2]) <= 0) self.assertTrue(nb_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories dt_model = DecisionTree.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) dt_model_dir = os.path.join(temp_dir, "dt") dt_model.save(self.sc, dt_model_dir) same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir) self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString()) rf_model = RandomForest.trainClassifier( rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) rf_model_dir = os.path.join(temp_dir, "rf") rf_model.save(self.sc, rf_model_dir) same_rf_model = RandomForestModel.load(self.sc, rf_model_dir) self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString()) gbt_model = GradientBoostedTrees.trainClassifier( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0) gbt_model_dir = os.path.join(temp_dir, "gbt") gbt_model.save(self.sc, gbt_model_dir) same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir) self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString()) try: rmtree(temp_dir) except OSError: pass
#ArrDelay is our response
#ArrDelay becomes the 8th column now, and total columns in the data = 12
	label = clean_line_split[0]
	nonLabel = clean_line_split[1:]
	return LabeledPoint(label, nonLabel)

parsedData = raw_data.map(parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
	float(test.count())

print('Time consumed = ' + str(datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

#save and load model
model.save(sc, "DTR-Narrow-2008")
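#DecisionTreeModel.load can restore the saved regressor later, e.g. in a
#separate scoring job; a sketch assuming the same SparkContext setup:
from pyspark.mllib.tree import DecisionTreeModel

loadedModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
print('Reloaded model with depth ' + str(loadedModel.depth()) +
      ' and ' + str(loadedModel.numNodes()) + ' nodes')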
def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(-1.0, [0, -1]), LabeledPoint(1.0, [0, 1]), LabeledPoint(-1.0, [0, -2]), LabeledPoint(1.0, [0, 2]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] lr_model = LinearRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(lr_model.predict(features[0]) <= 0) self.assertTrue(lr_model.predict(features[1]) > 0) self.assertTrue(lr_model.predict(features[2]) <= 0) self.assertTrue(lr_model.predict(features[3]) > 0) lasso_model = LassoWithSGD.train(rdd, iterations=10) self.assertTrue(lasso_model.predict(features[0]) <= 0) self.assertTrue(lasso_model.predict(features[1]) > 0) self.assertTrue(lasso_model.predict(features[2]) <= 0) self.assertTrue(lasso_model.predict(features[3]) > 0) rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10) self.assertTrue(rr_model.predict(features[0]) <= 0) self.assertTrue(rr_model.predict(features[1]) > 0) self.assertTrue(rr_model.predict(features[2]) <= 0) self.assertTrue(rr_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories dt_model = DecisionTree.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) rf_model = RandomForest.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1) self.assertTrue(rf_model.predict(features[0]) <= 0) self.assertTrue(rf_model.predict(features[1]) > 0) self.assertTrue(rf_model.predict(features[2]) <= 0) self.assertTrue(rf_model.predict(features[3]) > 0) gbt_model = GradientBoostedTrees.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4) self.assertTrue(gbt_model.predict(features[0]) <= 0) self.assertTrue(gbt_model.predict(features[1]) > 0) self.assertTrue(gbt_model.predict(features[2]) <= 0) self.assertTrue(gbt_model.predict(features[3]) > 0) try: LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10) except ValueError: self.fail() # Verify that maxBins is being passed through GradientBoostedTrees.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32) with self.assertRaises(Exception) as cm: GradientBoostedTrees.trainRegressor( rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import DecisionTree from pyspark.mllib.util import MLUtils # <codecell> data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt').cache() # <codecell> # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100) # <codecell> # Evaluate model on training instances and compute training error predictions = model.predict(data.map(lambda x: x.features)) labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions) trainMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data.count()) print('Training Mean Squared Error = ' + str(trainMSE)) print('Learned regression tree model:') print(model) # <codecell>
					vector) #if the feature from Feature Selection is needed

	data = pass2libsvm(reduced, sc.parallelize(classes)) #for the 5-tuple it should be something like data = pass2libsvm(vector)

	(trainingData, testData) = data.randomSplit([0.7, 0.3])
	print 'data divided'

	#trainingData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-16.out',15))
	#testData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-25.out',15))

	# Train a DecisionTree model.
	# Empty categoricalFeaturesInfo indicates all features are continuous.
	model = DecisionTree.trainClassifier(trainingData, numberClasses, {}) #, maxDepth=5, maxBins=32)

	#model.save(sc, "hdfs://master:9000/user/app/model-"+str(sys.argv[2]+".model"))
	print 'model done'

	#to load the model
	#sameModel = DecisionTreeModel.load(sc, "lrm_model.model")

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	metrics = MulticlassMetrics(labelsAndPredictions)
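	# A sketch of the numbers that can be pulled from MulticlassMetrics once it
	# has been built (assumes Spark >= 2.0 for the accuracy property; labels are
	# collected from the test set):
	print(metrics.confusionMatrix().toArray())
	print('accuracy: %s' % metrics.accuracy)
	for label in sorted(testData.map(lambda lp: lp.label).distinct().collect()):
		print('class %s precision = %s, recall = %s' % (label, metrics.precision(label), metrics.recall(label)))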
	header = train_rawDataWithHeader.first()
	rawData = train_rawDataWithHeader.filter(lambda x: x != header)
	rData = rawData.map(lambda x: x.replace("\"", ""))
	train_data = rData.map(lambda x: x.split(","))
	#test_data = prepare_data(test)
	header = test_rawDataWithHeader.first()
	rawData = test_rawDataWithHeader.filter(lambda x: x != header)
	rData = rawData.map(lambda x: x.replace("\"", ""))
	test_data = rData.map(lambda x: x.split(","))
	# RDDs cannot be concatenated with +; union() builds the category map over both sets
	lines = train_data.union(test_data)
	categoriesMap = lines.map(lambda fields: fields[0]).distinct().zipWithIndex().collectAsMap()
	train_RDD = train_data.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, len(r)-1)))
	test_RDD = test_data.map(lambda r: (extract_features(r, categoriesMap, len(r)-1), r[-1]))
	model = DecisionTree.trainClassifier(train_RDD, numClasses=2, categoricalFeaturesInfo={}, impurity='entropy', maxDepth=14, maxBins=9)
	count = 0
	num = 0
	positive = 0
	negative = 0
	truePositive = 0
	trueNegative = 0
	falsePositive = 0
	falseNegative = 0
	for data in test_RDD.collect():
		num += 1
		preds = int(model.predict(data[0]))
		#print(str(preds)+' '+str(data[1]))
		if preds == int(data[1]): count += 1
		if int(data[1]) == 0:
			negative += 1
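# The extract_label/extract_features helpers assumed above are not shown; a
# hypothetical sketch (they would be defined before the block above), assuming
# the last column is the label, the first column is categorical and gets
# one-hot encoded via categoriesMap, and the rest are numeric:
import numpy as np

def extract_label(fields):
    return float(fields[-1])

def convert_float(x):
    # "?" is a common missing-value marker in this kind of CSV
    return 0.0 if x == "?" else float(x)

def extract_features(fields, categoriesMap, featureEnd):
    categoryIdx = categoriesMap[fields[0]]
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoryIdx] = 1
    numericalFeatures = [convert_float(f) for f in fields[1:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))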
def main():
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    #Import a raw DataFrame (DF) once just to get the column info
    raw_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferSchema='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz")
    features = raw_df.columns

    #Manually specify the correct datatypes for each column and import a new DF
    fields = [StructField(field_name, FloatType(), True) for field_name in features]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df = spark.read.format('com.databricks.spark.csv').options(header='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz", schema=customSchema)
    df = df.na.fill(0.0)  # na.fill returns a new DataFrame and must be reassigned

    #Prepare features for computation:
    #remove features from a list precompiled on a correlation criterion
    with open('column_refine_list.csv', 'r') as f:
        csvlist = csv.reader(f, delimiter=',')
        for item in csvlist:
            column_to_go = item[:]
    print("Total number of features to be removed: %d" % (len(column_to_go)))
    print("\n")
    for item in column_to_go:
        df = df.drop(item)
    print("Final number of features is: %d" % (len(df.columns[1:-1])))

    #Decision Tree model training
    training_points = labelData(df)  #csv_numeric.map(ReducedlabelData)
    training_data, test_data = training_points.randomSplit([0.7, 0.3])
    print(training_data.first())
    t0 = time()
    tree_model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={},
                                              impurity='gini', maxDepth=4, maxBins=100)
    tt = time() - t0
    tree_model.save(sc, "DTmodel_model_reduced_222red")  #save the model for future use
    print("Model trained in : %.4f Sec" % (tt))
    print(tree_model.toDebugString())

    #Make predictions on the test set
    t0 = time()
    labels_and_preds = getLabelsPredictions(tree_model, test_data)
    test_accuracy = 100 * labels_and_preds.filter(
        lambda (v, p): v == p).count() / float(test_data.count())
    print("Test accuracy is : %.4f " % (test_accuracy))
    printMCC(labels_and_preds)
    tt = time() - t0
    print("Predictions and metrics computed in : %.4f Sec" % (tt))

    #Write the reduced DataFrame back out (DataFrameWriter takes no schema
    #argument; the column types were already fixed at read time)
    df.select(df.columns).write.format('com.databricks.spark.csv').options(header='true').save(
        '/data/ganesh/BigData/Bosch/Source/feature_reduction/numeric/feature_reduction_V2/df_739.csv')
    sc.stop()
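# The labelData helper used above is not shown; a hypothetical sketch, assuming
# the last column is the Response label, the first is the Id, and everything in
# between is a numeric feature (missing values mapped to 0.0):
from pyspark.mllib.regression import LabeledPoint

def labelData(df):
    return df.rdd.map(lambda row: LabeledPoint(
        row[-1],
        [0.0 if x is None else float(x) for x in row[1:-1]]))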
# inspect the result of the preprocessing
# print(labelpointRDD.first())

# randomSplit the data 3:1 (75%:25%) into a train set and a test set
(trainData, testData) = labelpointRDD.randomSplit([3, 1])
# print(testData.count())

# cache the train and test sets in memory to speed up execution
trainData.persist()
testData.persist()

# tune the maxDepth parameter
params = [5, 10, 15, 20]
for i in params:
    # decision tree classifier supported by Spark MLlib
    model = DecisionTree.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity="entropy", maxDepth=i, maxBins=15)

    # use model.predict on the testData features
    score = model.predict(testData.map(lambda p: p.features))

    # print the predictions
    # score.foreach(print)
    print(score.collect())  # all
    # print(score.take(2))  # first two
    # print(score.count())  # number of records

    # zip the predictions with the true labels
    scoreAndLabels = score.zip(testData.map(lambda p: p.label))

    # use MulticlassMetrics to build the confusion matrix and compute Accuracy, Recall, Precision
    metrics = MulticlassMetrics(scoreAndLabels)
    print(metrics.confusionMatrix())
    print("Accuracy = %s" % metrics.accuracy)
    print("Recall = %s" % metrics.recall(0))
    print("Precision = %s" % metrics.precision(0))
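# A sketch of how the sweep above could record each depth's accuracy and pick
# the best setting; reuses trainData/testData and the same hyperparameters:
results = []
for i in params:
    m = DecisionTree.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity="entropy", maxDepth=i, maxBins=15)
    sAndL = m.predict(testData.map(lambda p: p.features)).zip(testData.map(lambda p: p.label))
    results.append((i, MulticlassMetrics(sAndL).accuracy))
bestDepth, bestAcc = max(results, key=lambda r: r[1])
print("best maxDepth = %d (accuracy = %.4f)" % (bestDepth, bestAcc))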
def test_all(self, measure_columns=None, dimension_columns=None):
    if dimension_columns is None:
        dimension_columns = self._dimension_columns
    self._target_dimension = measure_columns[0]
    dimension = self._target_dimension
    max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
    max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
    # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = [
        x for x in self._measure_columns if x != self._target_dimension
    ]
    self.transform_data_frames()
    decision_tree_result = DecisionTreeResult()
    cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
    if len(cat_feature_info) > 0:
        max_length = max(cat_feature_info)
    else:
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))
    # print cat_feature_info
    if self._pandas_flag:
        dimension_classes = self._data_frame[dimension].nunique()
        self._data_frame = self._data_frame[[dimension] + all_dimensions + all_measures]
        x = self._data_frame.drop(dimension, axis=1)
        y = self._data_frame[dimension]
        for i in x.columns:
            x[i] = x[i].fillna(x[i].mode()[0])
        model = DecisionTreeRegressor(max_depth=6)
        model = model.fit(x, y)
        output_result = self.tree_to_code(model, list(x.columns))
        output_result = list(map(lambda x: x.strip(), output_result))
        print(output_result, "output_result")
    else:
        dimension_classes = self._data_frame.select(dimension).distinct().count()
        self._data_frame = self._data_frame[[dimension] + all_dimensions + all_measures]
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TODO: set maxBins at least equal to the max level of categories in the dimension column
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=6,
            maxBins=max_length)
        output_result = model.toDebugString()
    decision_tree = self.tree_json(output_result, self._data_frame, self._pandas_flag)
    self._new_tree = self.generate_new_tree(decision_tree)
    node_list = self.node_name_extractor(self._new_tree)
    node_list = list(self.flatten(node_list))
    correct_count_list = [i[0] for i in self._count_list]
    tree_dict = dict(list(zip(node_list, correct_count_list)))
    #self._new_tree = self.generate_new_tree_total(decision_tree)
    self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
    self._path_dict = self.path_dict_creator(node_list, self._new_tree)
    # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
    # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
    decision_tree_result.set_params(self._new_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability, self._path_dict)
    decision_tree_result.set_target_map(
        self._mapping_dict[self._target_dimension], self._aggr_data,
        self._important_vars)
    # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]*self._scriptStages["dtreeTrainingStart"]["weight"]/10
    # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
    #                     "dtreeTrainingEnd",\
    #                     "info",\
    #                     self._scriptStages["dtreeTrainingEnd"]["summary"],\
    #                     self._completionStatus,\
    #                     self._completionStatus)
    # CommonUtils.save_progress_message(self._messageURL,progressMessage)
self._dataframe_context.update_completion_status(self._completionStatus) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._analysisName, "dtreeTrainingEnd", "info", weightKey="script") # print decision_tree_result return decision_tree_result
from numpy import array from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import DecisionTree from pyspark import SparkContext sc = SparkContext.getOrCreate() data = [ LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0]), LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0]) ] model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {}) print(model)
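# The toy model can be probed point-by-point on the driver; with the training
# data above, the tree splits near 0.5, so the expected outputs are noted:
print(model.predict(array([0.0])))  # expected: 0.0
print(model.predict(array([1.0])))  # expected: 1.0
print(model.toDebugString())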
# <codecell> from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import DecisionTree from pyspark.mllib.util import MLUtils # <codecell> data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt').cache() data.take(5) # <codecell> # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100) # <codecell> predictions = model.predict(data.map(lambda x: x.features)) labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions) labelsAndPredictions.take(10) # <codecell> trainErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count()) print('Training Error = ' + str(trainErr)) print('Learned classification tree model:') print(model) # <codecell>