def _set_rddModel(self, _type, _SLA, data):
    if _type == 'regression':
        if _SLA == 'randomForest':
            self._rddModel = RandomForest.trainRegressor(
                data,
                categoricalFeaturesInfo={},
                numTrees=int(self.sparkOptions[4]),
                featureSubsetStrategy=self.sparkOptions[5],
                impurity='variance',
                maxDepth=int(self.sparkOptions[1]),
                maxBins=32)
        else:
            self._rddModel = ""
    else:  # classification
        if _SLA == 'randomForest':
            print(self.numClasses)
            self._rddModel = RandomForest.trainClassifier(
                data,
                numClasses=self.numClasses,
                categoricalFeaturesInfo={},
                numTrees=int(self.sparkOptions[4]),
                maxDepth=int(self.sparkOptions[1]),
                featureSubsetStrategy=self.sparkOptions[5],
                impurity=self.sparkOptions[2])
        else:
            self._rddModel = ""
def predict(training_data, test_data):
    # Train a random forest classifier from the given data.
    # The result is an RDD with the random forest's prediction for each
    # test data point.
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import RandomForest

    # Build LabeledPoints directly on the RDD (the last column is the
    # label, the rest are features) instead of collecting everything to
    # the driver and re-parallelizing it.
    labeled_data = training_data.map(
        lambda x: LabeledPoint(x[len(x) - 1], x[0:len(x) - 1]))

    model = RandomForest.trainClassifier(labeled_data,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=4,
                                         featureSubsetStrategy="auto",
                                         impurity="gini",
                                         maxDepth=6,
                                         maxBins=32,
                                         seed=12345)

    return model.predict(test_data)
def rfTest(sqlContext, dataset_rdd):
    # Split positives and negatives, then hold out 20% of each for testing.
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negative = dataset_negative.sample(False, 0.8)
    test_negative = dataset_negative.subtract(train_negative)
    trainset_rdd = train_positive.union(train_negative)
    testset_rdd = test_positive.union(test_negative)
    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()

    model = RandomForest.trainClassifier(trainset, 2, {}, 3)
    predictions = model.predict(testset.map(lambda x: x.features))
    predict = testset.map(lambda lp: lp.label).zip(predictions)

    # Precision/recall/F1 for the positive class, plus overall accuracy.
    hitAll = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and e[0] > 0.5).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    recallPositive = hitPositive / float(testset_positive)
    precision = hitPositive / float(positive)
    accuracy = hitAll / float(testset.count())
    F_Value = 2 / (1 / precision + 1 / recallPositive)
    return (trainset_nums, testset_nums, trainset_positive, testset_positive,
            positive, hitPositive, precision, recallPositive, accuracy,
            F_Value, model)
def evaluateForest(rawData):
    data = rawData.map(unencodeOneHot)

    (trainData, cvData) = data.randomSplit(weights=[0.9, 0.1])
    trainData.cache()
    cvData.cache()

    # Features 10 and 11 are categorical, with 4 and 40 distinct values.
    forest = RandomForest.trainClassifier(trainData, numClasses=7,
                                          categoricalFeaturesInfo={10: 4, 11: 40},
                                          numTrees=20,
                                          featureSubsetStrategy="auto",
                                          impurity="entropy", maxDepth=30,
                                          maxBins=300)

    metrics = getMetrics(forest, cvData)
    print(metrics.precision())

    # Score a single hand-written example. Note: `input` shadowed the
    # builtin, and Python 3's map() is lazy, so build a list explicitly.
    input_line = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
    vector = Vectors.dense([float(x) for x in input_line.split(",")])
    print(forest.predict(vector))

    trainData.unpersist()
    cvData.unpersist()
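# A minimal sketch of the `getMetrics` helper assumed above (only its call
# site appears in the snippet): it scores the cross-validation set and wraps
# the (prediction, label) pairs in MulticlassMetrics, whose precision() is
# what evaluateForest prints.
from pyspark.mllib.evaluation import MulticlassMetrics

def getMetrics(model, data):
    predictionsAndLabels = model.predict(data.map(lambda d: d.features)) \
        .zip(data.map(lambda d: d.label))
    return MulticlassMetrics(predictionsAndLabels)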
def trainevaluatemodel_tree(traindata, validationdata, numtrees, impurity,
                            maxdepth, maxbins, seed):
    starttime = time()
    model = RandomForest.trainClassifier(traindata, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=numtrees,
                                         featureSubsetStrategy='auto',
                                         impurity=impurity, maxDepth=maxdepth,
                                         maxBins=maxbins, seed=seed)
    AUC = evaluation(model, validationdata)
    duration = time() - starttime
    print('Param:' + '\n' +
          'numtrees:' + str(numtrees) + '\n' +
          'impurity:' + str(impurity) + '\n' +
          'maxdepth:' + str(maxdepth) + '\n' +
          'maxbins:' + str(maxbins) + '\n' +
          'time:' + str(duration) + '\n' +
          'AUC:' + str(AUC))
    return (numtrees, impurity, maxdepth, maxbins, duration, AUC)
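# A minimal sketch of the `evaluation` helper assumed above: it returns the
# area under the ROC curve on the validation set. The helper name and
# signature come from the call site; the metric choice is an assumption.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluation(model, validationdata):
    # Tree-ensemble predict() cannot run inside an RDD transformation, so
    # score the whole features RDD first and zip the scores with the labels.
    scores = model.predict(validationdata.map(lambda p: p.features))
    score_and_labels = scores.zip(validationdata.map(lambda p: p.label))
    return BinaryClassificationMetrics(score_and_labels).areaUnderROC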
def random_forest():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    # print("\npyspark version:" + str(sc.version) + "\n")

    data = MLUtils.loadLibSVMFile(sc, './data/sample_libsvm_data.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4,
                                         maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # filter() passes each (label, prediction) pair as a single argument,
    # so index into the tuple rather than declaring two parameters.
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model (the save path must match the load path;
    # the original saved to ".model/..." but loaded from "./model/...")
    model.save(sc, "./model/myRandomForestClassificationModel")
    sameModel = RandomForestModel.load(
        sc, "./model/myRandomForestClassificationModel")
def train():
    data = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)
    print("[INFO] load complete.")

    # Keep only 20% of the data, then split that into train and test sets.
    data = data.randomSplit([0.2, 0.8])[0]
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=NUM_OF_CLASSES,
                                         categoricalFeaturesInfo={},
                                         numTrees=NUM_OF_TREES,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=MAXDEPTH,
                                         maxBins=MAXBINS)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))
    print('[INFO] Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, TEST_MODEL_PATH)
    sameModel = RandomForestModel.load(sc, TEST_MODEL_PATH)
def generateRandomForest():
    if os.path.exists(RF_PATH):
        print("RF_PATH already exists")
        return

    data = sc.textFile(F_PATH).map(parseLine)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=len(classes),
                                         categoricalFeaturesInfo={},
                                         numTrees=4,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error', str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save and load model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
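# A minimal sketch of the `parseLine` helper assumed above: it turns one
# comma-separated text line into a LabeledPoint with the label first. The
# actual column layout of F_PATH is an assumption.
from pyspark.mllib.regression import LabeledPoint

def parseLine(line):
    values = [float(v) for v in line.split(',')]
    return LabeledPoint(values[0], values[1:])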
def main():
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, options.output_model)
    sameModel = RandomForestModel.load(sc, options.output_model)
def testOnce():
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1 - test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=num_trees,
                                         featureSubsetStrategy=strat,
                                         impurity='gini',
                                         maxDepth=max_depth, maxBins=32)

    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())

    # confusion-matrix counts: label 0 = galaxy, label 1 = star
    Mg = float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 0 and vp[1] == 1).count())
    Ng = float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 0 and vp[1] == 0).count())
    Ms = float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 1 and vp[1] == 0).count())
    Ns = float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 1 and vp[1] == 1).count())

    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    probs = probsAndScores[1].map(lambda x: x / num_trees)

    labelsAndPredictions = labelsAndPredictions.zip(probs)
    labelsAndProbs = testData.map(lambda lp: lp.label).zip(probs)
    save(labelsAndProbs, 'answers')

    print('Galaxy Purity = ' + str(Ng / (Ng + Ms)))
    print('Galaxy Completeness = ' + str(Ng / (Ng + Mg)))
    print('Star Purity = ' + str(Ns / (Ns + Mg)))
    print('Star Completeness = ' + str(Ns / (Ns + Ms)))
    print('Accuracy = ' + str(1 - testErr))
    print('Threshold method accuracy = ' + str(threshold_accuracy))
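# A minimal sketch of the `save` helper used above: it writes the
# (label, probability) pairs out as text. The name and output format are
# assumptions. No sketch is attempted for `probTest`, since per-tree vote
# counts are not exposed by the public PySpark MLlib RandomForestModel API.
def save(rdd, path):
    rdd.map(lambda lp: '%f,%f' % (lp[0], lp[1])).saveAsTextFile(path)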
def Random_Forest(filename, sc):
    # NOTE: the filename argument is immediately overridden by this
    # hard-coded path.
    filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    # model.save(sc, "target/tmp/myRandomForestClassificationModel")
    # sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
def createRFC(rdd, numclasses, catfeatinfo):
    # prone to overfitting; tune hyperparameters later
    return RandomForest.trainClassifier(rdd, numClasses=numclasses,
                                        categoricalFeaturesInfo=catfeatinfo,
                                        numTrees=3)
def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Truncate the last 2 features of the data for dataPoint in train_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) for dataPoint in test_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42) # Train the models randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={}, numTrees=750, seed=42, maxDepth=30, maxBins=32) # Test the model testRandomForest(randomForestModel, parallelized_test_set)
def createRandomForest(sparkDF, NUMTREES, NUMCLASSES):
    # ===========================
    # douglas fletcher
    # purpose: create random forest model
    # input:
    #   spark type sparkSession
    #   sparkDF type sparkDF
    # output:
    #   model type RandomForestModel
    # ===========================
    # create labelled point rdd
    data = sparkDF.rdd.map(
        lambda row: LabeledPoint(row["SeriousDlqin2yrs"], list(row[2:]))
    )
    # workaround for an unknown bug: force evaluation of the mapped RDD
    val = data.collect()[1]
    # create random forest model
    model = RandomForest.trainClassifier(
        data,
        numTrees=NUMTREES,
        numClasses=NUMCLASSES,
        # maxDepth=MAXDEPTH,
        impurity='gini',
        featureSubsetStrategy="auto",
        categoricalFeaturesInfo={},
        seed=42,
        maxBins=32
    )
    return model
def train(self, training_data):
    return RandomForest.trainClassifier(training_data, numClasses=2,
                                        categoricalFeaturesInfo={},
                                        numTrees=6,
                                        featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=5,
                                        maxBins=32)
def main():
    # Read the Titanic train CSV from HDFS and drop the header.
    trainTitanic = sc.textFile(srcDir + "titanic_train.csv")
    trainHeader = trainTitanic.first()
    trainTitanic = trainTitanic.filter(lambda line: line != trainHeader) \
                               .mapPartitions(lambda x: csv.reader(x))
    trainTitanic.first()

    # Data transformations, then filter out lines with empty strings
    trainTitanic = trainTitanic.map(lambda line: line[1:3]
                                    + sexTransformMapper(line[4])
                                    + line[5:11])
    trainTitanic = trainTitanic.filter(lambda line: line[3] != '') \
                               .filter(lambda line: line[4] != '')
    trainTitanic.take(10)

    # Create the LabeledPoint RDD MLlib expects: (label, [v1, v2, ..., vp]).
    # The feature slice must be passed directly; wrapping it in another
    # list (as the original did) nests the features inside one element.
    trainTitanicLP = trainTitanic.map(
        lambda line: LabeledPoint(line[0], line[1:5]))
    trainTitanicLP.first()

    # splitting dataset into train and test set: 70% train, 30% test
    (trainData, testData) = trainTitanicLP.randomSplit([0.7, 0.3])

    # Random forest: same parameters as sklearn (?)
    from pyspark.mllib.tree import RandomForest
    time_start = time.time()
    model_rf = RandomForest.trainClassifier(trainData, numClasses=2,
                                            categoricalFeaturesInfo={},
                                            numTrees=100,
                                            featureSubsetStrategy='auto',
                                            impurity='gini', maxDepth=12,
                                            maxBins=32, seed=None)
    model_rf.numTrees()
    model_rf.totalNumNodes()
    time_end = time.time()
    time_rf = time_end - time_start
    print("RF takes %d s" % time_rf)

    # Predictions on test set
    predictions = model_rf.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

    # first metrics
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    metrics = BinaryClassificationMetrics(labelsAndPredictions)
    print('=====================================================')
    print(' output : ')
    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
    print('=====================================================')
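# A minimal sketch of the `sexTransformMapper` helper assumed above: it
# encodes the Titanic "Sex" column as a one-element numeric list so it can
# be concatenated with the other feature slices. The exact encoding is an
# assumption.
def sexTransformMapper(elem):
    return [0.0] if elem == 'female' else [1.0]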
def trainAndSave(filename='RFmodel' + str(num_trees) + strat + str(max_depth)):
    model = RandomForest.trainClassifier(data, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=num_trees,
                                         featureSubsetStrategy=strat,
                                         impurity='gini',
                                         maxDepth=max_depth, maxBins=32)
    model.save(sc, filename)
def malware_predict_and_store(sc, training_set, X_test, num_of_trees, depth):
    # Train a 9-class random forest on the driver-side training set.
    classifier_model = RandomForest.trainClassifier(
        sc.parallelize(training_set), 9, {}, num_of_trees, maxDepth=depth)

    # Class labels are 0-8 internally; shift back to the 1-9 range.
    result = []
    for index in X_test:
        result.append(int(classifier_model.predict(index) + 1))

    name = 'result.txt'
    df = pd.DataFrame(result)
    df.to_csv(name, header=False, index=False)
def trainEvaluateModel(trainData):
    model = RandomForest.trainClassifier(trainData, 2,
                                         categoricalFeaturesInfo={},
                                         numTrees=15,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=12,
                                         maxBins=32)
    return model
def train_model(cls, trainData, cateFeaInfo={}, trees=3, impurity="gini",
                depth=4):
    """Train a random forest classification model."""
    # Note: the original parameter was misspelled "trianData" while the
    # body used "trainData", which raised a NameError.
    model = RandomForest.trainClassifier(trainData, numClasses=2,
                                         categoricalFeaturesInfo=cateFeaInfo,
                                         numTrees=trees,
                                         featureSubsetStrategy="auto",
                                         impurity=impurity, maxDepth=depth,
                                         maxBins=32)
    return model
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def evaluate(self, trainingData, testData=None, metric=None):
    if testData is not None:
        model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=10,
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=4,
                                             maxBins=32)
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(testData.count())
        print('Test Error = ' + str(testErr))
    else:
        # cross validation
        pass
def predict(training_data, test_data):
    # Train a random forest classifier from the given data.
    # The result is an RDD with the random forest's prediction for each
    # test data point.
    model = RandomForest.trainClassifier(training_data, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=RF_NUM_TREES,
                                         featureSubsetStrategy="auto",
                                         impurity="gini",
                                         maxDepth=RF_MAX_DEPTH,
                                         maxBins=RF_MAX_BINS,
                                         seed=RANDOM_SEED)
    return model.predict(test_data)
def trainModel(trainingData):
    print("\nTraining Random Forest model started!")
    Utils.logTime()

    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=5,
                                         maxBins=32)

    print('\nTraining Random Forest model finished')
    Utils.logTime()
    return model
def predict(training_data, test_data):
    # Train a random forest classifier from the given data.
    # The result is an RDD with the random forest's prediction for each
    # test data point.
    model = RandomForest.trainClassifier(training_data, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=250,
                                         featureSubsetStrategy="auto",
                                         impurity="gini", maxDepth=6, seed=0)
    predictions = model.predict(test_data)
    return predictions
def kfolds():
    # folds = kFold(data, k)  # this would work in Java
    acc = 0
    spurity = 0
    scomp = 0
    gpurity = 0
    gcomp = 0
    # takeSample needs an int, so use integer division for the fold size
    foldsize = data.count() // k
    tested = sc.parallelize([])
    for i in range(k):
        test = sc.parallelize(
            data.subtract(tested).takeSample(False, foldsize))
        tested = tested.union(test)
        train = data.subtract(test)

        # train the random forest
        model = RandomForest.trainClassifier(train, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=num_trees,
                                             featureSubsetStrategy="auto",
                                             impurity='gini',
                                             maxDepth=max_depth, maxBins=32)

        predictions = model.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(test.count())

        # confusion-matrix counts: label 0 = galaxy, label 1 = star
        Mg = float(labelsAndPredictions.filter(
            lambda vp: vp[0] == 0 and vp[1] == 1).count())
        Ng = float(labelsAndPredictions.filter(
            lambda vp: vp[0] == 0 and vp[1] == 0).count())
        Ms = float(labelsAndPredictions.filter(
            lambda vp: vp[0] == 1 and vp[1] == 0).count())
        Ns = float(labelsAndPredictions.filter(
            lambda vp: vp[0] == 1 and vp[1] == 1).count())

        gpurity += Ng / (Ng + Ms)
        gcomp += Ng / (Ng + Mg)
        spurity += Ns / (Ns + Mg)
        scomp += Ns / (Ns + Ms)
        acc += 1 - testErr

    print('with ' + str(k) + ' folds:')
    print('Average Galaxy Purity = ' + str(gpurity / k))
    print('Average Galaxy Completeness = ' + str(gcomp / k))
    print('Average Star Purity = ' + str(spurity / k))
    print('Average Star Completeness = ' + str(scomp / k))
    print('Average Accuracy = ' + str(acc / k))
def RF_train(data, filename):
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)

    model_RF = RandomForest.trainClassifier(training, 2, {}, 5, seed=42)

    # Tree-ensemble predict() cannot be called inside an RDD transformation
    # in PySpark; score the features RDD directly and zip with the labels.
    predictions = model_RF.predict(test.map(lambda x: x.features))
    predictionAndlabel = predictions.zip(test.map(lambda x: x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda xv: xv[0] == xv[1]).count() / test.count()
    print("accuracy of model_RF:%f" % accuracy)
    pre_all(data, model_RF, filename)
    return model_RF, accuracy
def train_random_forest(train_rdd):
    # Build Model
    model = RandomForest.trainClassifier(train_rdd, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=15,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=9,
                                         maxBins=32, seed=42)
    return model
def trainOptimalModel(trainingData, testData):
    print("\nTraining optimal Random Forest model started!")
    Utils.logTime()

    numTreesVals = [3, 5, 8]
    featureSubsetStrategyVals = ['auto', 'all', 'sqrt', 'log2', 'onethird']
    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalNumTrees = None
    optimalFeatureSubsetStrategy = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    try:
        # exhaustive grid search over every parameter combination
        for curNumTree in numTreesVals:
            for curFeatureSubsetStrategy in featureSubsetStrategyVals:
                for curImpurity in impurityVals:
                    for curMaxDepth in maxDepthVals:
                        for curMaxBins in maxBinsVals:
                            model = RandomForest.trainClassifier(
                                trainingData,
                                numClasses=2,
                                categoricalFeaturesInfo={},
                                numTrees=curNumTree,
                                featureSubsetStrategy=curFeatureSubsetStrategy,
                                impurity=curImpurity,
                                maxDepth=curMaxDepth,
                                maxBins=curMaxBins)
                            testErr = Evaluation.evaluate(model, testData)
                            if minError is None or testErr < minError:
                                minError = testErr
                                optimalNumTrees = curNumTree
                                optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
                                optimalImpurity = curImpurity
                                optimalMaxDepth = curMaxDepth
                                optimalBinsVal = curMaxBins
                                optimalModel = model
    except:
        # report the parameters in use when the exception was raised
        msg = "\nException during model training with below parameters:"
        msg += "\tnum trees: " + str(curNumTree)
        msg += "\tfeature subset strategy: " + curFeatureSubsetStrategy
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy,
               optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
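# A minimal sketch of the `Evaluation.evaluate` helper assumed above: it
# returns the misclassification rate of a model on a LabeledPoint RDD. The
# class and method names come from the call site.
class Evaluation(object):
    @staticmethod
    def evaluate(model, testData):
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        return labelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(testData.count())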
def predict_and_save(sc, train_data, X_test, num_trees, max_depth):
    '''Trains a 9-class Random Forest classifier for one (num_trees,
    max_depth) setting and writes the test predictions to a submission
    file; intended to be called in a loop over different values.
    '''
    model = RandomForest.trainClassifier(sc.parallelize(train_data), 9, {},
                                         num_trees, maxDepth=max_depth)
    # Class labels are 0-8 internally; shift back to the 1-9 range.
    a = []
    for i in X_test:
        a.append(int(model.predict(i) + 1))

    filename = 'submit' + str(num_trees) + str(max_depth) + '.txt'
    b = pd.DataFrame(a)
    b.to_csv(filename, header=False, index=False)
def testClassification(trainingData, testData):
    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda v_p: v_p[0] != v_p[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
def test_impurity(n):
    model1 = RandomForest.trainClassifier(trainingData, numClasses=4,
                                          categoricalFeaturesInfo={},
                                          numTrees=n,
                                          featureSubsetStrategy="auto",
                                          impurity='entropy', maxDepth=5,
                                          maxBins=32)
    predictions1 = model1.predict(testData.map(lambda x: x.features))
    labelsAndPredictions1 = testData.map(lambda lp: lp.label).zip(predictions1)
    testErr1 = labelsAndPredictions1.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    return testErr1
def random_forest(self, train_sample, test_sample, impurity, num_trees):
    # import pdb; pdb.set_trace()
    rf_model = RandomForest.trainClassifier(train_sample, numClasses=2,
                                            categoricalFeaturesInfo={},
                                            numTrees=int(num_trees),
                                            featureSubsetStrategy="auto",
                                            impurity=impurity, maxDepth=5,
                                            seed=123)
    # Cross-validate on the model
    return self.cross_validate(rf_model, test_sample)
def main():
    ## Data
    iris = datasets.load_iris()
    X = iris.data
    Y = iris.target
    # Tuple-unpacking lambdas are gone in Python 3 and map() is lazy,
    # so build the LabeledPoints with a list comprehension instead.
    data = [LabeledPoint(y, x) for x, y in zip(X, Y)]

    ## Modelling
    model = RandomForest.trainClassifier(sc.parallelize(data), 3, {}, 3,
                                         seed=42)

    ## Prediction
    preds = [model.predict(_) for _ in X]

    ## Accuracy
    print(sum(preds == Y) * 1.0 / len(Y))
def get_rf_model(sc, train=None):
    model_path = 'rf.model'
    if train is None:
        model = RandomForestModel.load(sc, model_path)
    else:
        model = RandomForest.trainClassifier(train, numClasses=2,
                                             numTrees=10,
                                             categoricalFeaturesInfo={},
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=10,
                                             maxBins=100)
        model.save(sc, model_path)
    return model
def predictions_RF(train_data_labeled, test_data_labeled, RF_NUM_TREES):
    time_start = time.time()
    model_rf = RandomForest.trainClassifier(train_data_labeled, numClasses=10,
                                            categoricalFeaturesInfo={},
                                            numTrees=RF_NUM_TREES,
                                            featureSubsetStrategy="auto",
                                            impurity="gini", maxDepth=10,
                                            maxBins=32, seed=10)

    predictions = model_rf.predict(test_data_labeled.map(lambda x: x.features))
    # Persist the test labels to HDFS in a single partition. Note:
    # saveAsTextFile returns None, so don't assign its result.
    test_data_labeled.map(lambda x: x.label).repartition(1).saveAsTextFile(
        "hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/stage3")
    labels_and_predictions = test_data_labeled.map(lambda x: x.label).zip(predictions)
    rfAccuracy = labels_and_predictions.filter(
        lambda x: x[0] == x[1]).count() / float(test_data_labeled.count())
    time_end = time.time()
    time_rf = time_end - time_start
    print("=========================================================================================================")
    print("run time: {}, RandomForest accuracy: {}".format(time_rf, rfAccuracy))
def train_trend_model(self, model, data, i):
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    if self.trend_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainClassifier(rdd_data, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=40,
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=20,
                                             maxBins=32)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        model = NaiveBayes.train(rdd_data)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        model = LogisticRegressionWithSGD.train(
            rdd_data, iterations=10000, step=0.001,
            initialWeights=None if model is None else model.weights)
    elif self.trend_prediction_method == self.SVM:
        model = SVMWithSGD.train(
            rdd_data, iterations=10000, step=0.001,
            initialWeights=None if model is None else model.weights)

    return model
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print("Logistic Regression Model")
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print("Decision Tree Model")
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=5,
                                             maxBins=32)
    elif name == 'rf':
        print_box()
        print("Random Forest Model")
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=15,
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=5,
                                             maxBins=50)
    return model
    return LabeledPoint(label, features)

ml_data = sc.textFile('final_output_final.csv').map(parseLine)
(trainingData, testData) = ml_data.randomSplit([0.7, 0.3])

# Cardinality of each categorical feature, keyed by feature index.
categoricalFeaturesInfo = {0: len(set(cols[agency_index])),
                           1: len(set(cols[comp_index])),
                           2: len(set(cols[loc_index])),
                           3: len(set(cols[incident_index])),
                           4: len(set(cols[add_index])),
                           5: len(set(cols[city_index])),
                           6: len(set(cols[fac_index])),
                           7: len(set(cols[status_index])),
                           8: len(set(cols[bor_index]))}
numClasses = 2
numTrees = 3
featureSubsetStrategy = "auto"
impurity = 'gini'
maxDepth = 20
# maxBins must be at least as large as the largest categorical cardinality
maxBins = max(categoricalFeaturesInfo.values()) + 10

model = RandomForest.trainClassifier(trainingData, numClasses,
                                     categoricalFeaturesInfo, numTrees,
                                     featureSubsetStrategy, impurity,
                                     maxDepth, maxBins)
model.save(sc, "modelCritical")

predictions = model.predict(testData.map(lambda x: x.features))
# print(type(predictions.collect()))
temp = predictions.collect()
print(temp)

labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# this counts matches (v == p), so it is the accuracy, not the test error
accuracy = labelsAndPredictions.filter(
    lambda vp: vp[0] == vp[1]).count() / float(testData.count())
print('Accuracy = ' + str(accuracy))
'''
predictList = []
# MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/spam_docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=4,
    maxBins=32,
)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

# Per-class error rates, computed separately for label 0.0 and label 1.0.
posErr = (
    float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 0.0 and vp[0] != vp[1]).count())
    / testData.filter(lambda lp: lp.label == 0.0).count()
)
negErr = (
    float(labelsAndPredictions.filter(
        lambda vp: vp[0] == 1.0 and vp[0] != vp[1]).count())
    / testData.filter(lambda lp: lp.label == 1.0).count()
)
    # categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
    # data.cache()
    # mappings = [get_mapping(data, i) for i in categorical]
    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
    return labelpoints

data = label_points(data_raw)
training, testing = data.randomSplit([0.5, 0.5], 0)

model = RandomForest.trainClassifier(training, numClasses=7,
                                     categoricalFeaturesInfo={},
                                     numTrees=1000,
                                     featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
# this is the misclassification rate, not the accuracy
error_rate = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testing.count())
print(error_rate)

# https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
#
# cat_len = sum(map(len, mappings))
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/PartyVectors/')

# Split the data into training and test sets
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()

# The depth of the tree proved to be a significant bottleneck
model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                     categoricalFeaturesInfo={},
                                     numTrees=700,
                                     featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=8, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print("")
print("")
print('Test Error: ' + str(testErr))
def train(self, trainingData):
    self.model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                              categoricalFeaturesInfo={},
                                              numTrees=10,
                                              featureSubsetStrategy="auto",
                                              impurity='gini', maxDepth=4,
                                              maxBins=32)
test_lp_arr = []
sample_data = all_data[train_indexes]
test_data = all_data[test_indexes]

# Build LabeledPoints for the training and test partitions.
for survived, record in sample_data:
    lparr.append(LabeledPoint(survived, tuple(record)))
for survived, record in test_data:
    test_lp_arr.append(LabeledPoint(survived, tuple(record)))

training_data = sc.parallelize(lparr).cache()
test_data_rdd = sc.parallelize(test_lp_arr).cache()

classificationModel = RandomForest.trainClassifier(training_data,
                                                   numClasses=2,
                                                   categoricalFeaturesInfo={},
                                                   numTrees=3)
result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))

print(classificationModel)
print(classificationModel.toDebugString())
print("===============================")

predicted_data = result.collect()
actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

print(mean_absolute_error(actual_data, predicted_data))
print(accuracy_score(actual_data, predicted_data))
# for p in predicted_data:
#     print(p)

# only the first train/test split is evaluated
break
dataPath = 'train_svm'  # 'data/mllib/sample_libsvm_data.txt'
if len(sys.argv) == 2:
    dataPath = sys.argv[1]
if not os.path.isfile(dataPath):
    sc.stop()
    usage()

points = MLUtils.loadLibSVMFile(sc, dataPath)

# Re-index class labels if needed.
(reindexedData, origToNewLabels) = reindexClassLabels(points)
numClasses = len(origToNewLabels)

# Train a classifier.
categoricalFeaturesInfo = {}  # no categorical features
# model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
#                                      categoricalFeaturesInfo=categoricalFeaturesInfo)
model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses,
                                     categoricalFeaturesInfo=categoricalFeaturesInfo,
                                     numTrees=30,
                                     featureSubsetStrategy='auto',
                                     impurity='gini', maxDepth=8, maxBins=40)

# Print learned model and stats.
print(origToNewLabels)
print("Trained RandomForest for classification:")
print(" Training accuracy: %g" % getAccuracy(model, reindexedData))
print(model)

# testdata must be loaded before it can be scored; the original left this
# line commented out and then used the undefined name.
testdata = MLUtils.loadLibSVMFile(sc, 'test_svm')
predictions = model.predict(testdata.map(lambda x: x.features))
# labels = testdata.map(lambda l: l.label)
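# Minimal sketches of the two helpers assumed above. reindexClassLabels maps
# arbitrary class labels onto the contiguous range 0..numClasses-1 that
# MLlib requires; getAccuracy scores a model on a LabeledPoint RDD. Names
# and signatures come from the call sites.
from pyspark.mllib.regression import LabeledPoint

def reindexClassLabels(points):
    origLabels = sorted(points.map(lambda lp: lp.label).distinct().collect())
    origToNewLabels = {orig: float(new) for new, orig in enumerate(origLabels)}
    reindexed = points.map(
        lambda lp: LabeledPoint(origToNewLabels[lp.label], lp.features))
    return (reindexed, origToNewLabels)

def getAccuracy(model, data):
    predictions = model.predict(data.map(lambda lp: lp.features))
    labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
    return labelsAndPredictions.filter(
        lambda vp: vp[0] == vp[1]).count() / float(data.count())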
data = sc.textFile("team_result.txt") data = data.map(lambda line: line.split(",")) data = data.map(lambda x: LabeledPoint(float(x[5]), [x[0], x[1], x[2], x[3], x[4]])) # Split the dataset into training set (70%) and test set (30%) trainingData, testData = data.randomSplit([0.7, 0.3], seed=1071) # Create and train the naive Bayes model naiveBayesModel = NaiveBayes.train(trainingData, 1.0) # Apply the model to the test set predictionAndLabelNaiveBayes = testData.map(lambda x: (naiveBayesModel.predict(x.features), x.label)) # Calculate the accuracy of the model errorNaiveBayes = 1.0 * predictionAndLabelNaiveBayes.filter(lambda (x, y): x != y).count() / testData.count() print "Naive Bayes model classification error: {0:f}".format(errorNaiveBayes) # Create and train the random forest model randomForestModel = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={0: 9, 1: 9, 2: 9, 3: 9, 4: 9}, numTrees=3, impurity="gini", maxDepth=4, maxBins=32, seed=1071) ''' Note taken from the official API documentation: In Python, predict cannot currently be used within an RDD transformation or action. Call predict directly on the RDD instead. ''' predictionsRandomForest = randomForestModel.predict(testData.map(lambda x: x.features)) labelsAndPredictionsRF = testData.map(lambda x: x.label).zip(predictionsRandomForest) errorRandomForest = labelsAndPredictionsRF.filter(lambda (x, y): x != y).count() / float(testData.count()) print "Random forest classification error: {0:f}".format(errorRandomForest)
def main(): appName = "BadOrGood;zl" conf = (SparkConf() .setAppName(appName) .set("spark.executor.memory", "5g") .set("spark.executor.cores","3") .set("spark.executor.instance", "3") ) sc = SparkContext(conf = conf) hc = HiveContext(sc) #fetch data #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd' #fetchDataToFile(hc, filepath) #load data # AllDataRawrdd = sc.pickleFile(filepath) \ # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \ # .repartition(10) AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10) #standardizer for train and test data model = StandardScaler(True, True) \ .fit( AllDataRawrdd \ .map( lambda _: Vectors.dense(_['feature']) ) ) labels = AllDataRawrdd.map(lambda _: _['label']) featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) ) AllDataRawrdd = labels \ .zip(featureTransformed) \ .map( lambda _: { 'label':_[0], 'feature':_[1] } ) #sampling trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100) trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist() testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist() #prediction & test lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1") resultrdd = test(lrmLBFGS, testDatardd) lrmLBFGSFone = fone(resultrdd) lrmLBFGSac = accuracy(resultrdd) lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1") resultrdd = test(lrmSGD, testDatardd) lrmSGDFone = fone(resultrdd) lrmSGDac = accuracy(resultrdd) dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10) resultrdd = test(dt, testDatardd) dtFone = fone(resultrdd) dtac = accuracy(resultrdd) rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10) resultrdd = test(rf, testDatardd) rfFone = fone(resultrdd) rfac = accuracy(resultrdd) print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac) print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac) print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac) print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac) print lrmLBFGS.weights print lrmSGD.weights sc.stop()
# ### Reducing data size

# In[31]:

Data1 = Data.sample(False, 0.1, seed=255).cache()
(trainingData, testData) = Data1.randomSplit([0.7, 0.3], seed=255)
trainingData.cache()
testData.cache()

# ### Random Forest

from pyspark.mllib.tree import RandomForest, RandomForestModel

errors = {}
for depth in [16]:
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=18, maxDepth=depth)
    errors[depth] = {}
    dataSets = {'train': trainingData, 'test': testData}
    for name in dataSets.keys():
        # Calculate errors on train and test sets
        data = dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(
            lambda vp: vp[0] != vp[1]).count() / float(data.count())
        errors[depth][name] = Err
    print(depth, errors[depth])

errors = {}
for depth in [18]:
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=15, maxDepth=depth)
    errors[depth] = {}
numFeatures = parsed_data.map(
    lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
labeled_data = parsed_data.map(
    lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
unbalance_test = data_ans_0827.map(feature_char_to_num).cache()
l_unbal_te = unbalance_test.map(
    lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

# split the data into trainData and testData
(trainData, testData) = labeled_data.randomSplit([0.9, 0.1])

len_list = [len(i) for i in fe]
# shift indices down by one because the slice [1:-2] drops the first value
col_na_l = [i - 1 for i in col_na]
# shift/drop indices around the removed 85th column
col_na_l = [i - 1 for i in col_na_l if i >= 83]
# categorical feature dict, e.g. {1: 3, 5: 8}; NOTE: built but not passed
# to the trainer below, which treats all features as continuous
features_dict = dict(zip(col_na_l, len_list))

model = RandomForest.trainClassifier(trainData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=50,
                                     featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on the unbalanced test set and compute the error rate.
predictions = model.predict(l_unbal_te.map(lambda x: x.features))
labelsAndPredictions = l_unbal_te.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(l_unbal_te.count())
print('Test Error = ' + str(testErr))

# recall = TP / (TP + FN)
recall = labelsAndPredictions.filter(
    lambda vp: vp[0] == vp[1] and vp[1] == 1).count() / float(
        labelsAndPredictions.filter(lambda vp: vp[0] == 1).count())
print("recall = TP/(TP+FN)", recall)

# precision = TP / (TP + FP)
precision = labelsAndPredictions.filter(
    lambda vp: vp[0] == vp[1] and vp[0] == 1).count() / float(
        labelsAndPredictions.filter(lambda vp: vp[1] == 1).count())
print("precision = TP/(TP+FP)", precision)

# ratio of positive to negative labels in the test set
print("1/0", labelsAndPredictions.filter(
    lambda vp: vp[0] == 1).count() / float(
        labelsAndPredictions.filter(lambda vp: vp[0] == 0).count()))
# print("False", labeled_data.filter(lambda p: p.label == 0).count())
# print("Positive", labeled_data.filter(lambda p: p.label == 1).count())
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
def trainRandomForest(data):
    return RandomForest.trainClassifier(data, numClasses=9,
                                        categoricalFeaturesInfo={},
                                        numTrees=10,
                                        featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=30,
                                        maxBins=32)
# trainingData = trainRDD.map(lambda x: LabeledPoint([k.strip() for k in x.split(",") if k][-1], [k.strip() for k in x.split(",") if k][1:4]))
# testData = testRDD.map(lambda x: LabeledPoint([k.strip() for k in x.split(",") if k][-1], [k.strip() for k in x.split(",") if k][1:4]))

# new transformed dataset: nxtLoc <-- Day + CurrLoc + NxtTimeInt
trainingData = trainRDD.map(lambda x: LabeledPoint(
    [k.strip() for k in x.split(",") if k][-1],
    list([k.strip() for k in x.split(",") if k][1:2])
    + [k.strip() for k in x.split(",") if k][3:5]))
testData = testRDD.map(lambda x: LabeledPoint(
    [k.strip() for k in x.split(",") if k][-1],
    list([k.strip() for k in x.split(",") if k][1:2])
    + [k.strip() for k in x.split(",") if k][3:5]))

# model = DecisionTree.trainClassifier(trainingData, numClasses=400,
#                                      categoricalFeaturesInfo={},
#                                      impurity='entropy', maxDepth=10,
#                                      maxBins=32)
# ModelDict[name] = model

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, numClasses=400,
                                     categoricalFeaturesInfo={},
                                     numTrees=17,
                                     featureSubsetStrategy="auto",
                                     impurity='entropy', maxDepth=9,
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testData.count())
avg = avg + testErr
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

errorPredictions.write("Region ID: " + str(name) + " Test Error = : "
                       + str(testErr) + " Time taken: "
                       + str(time.time() - start_time) + " trainRDD count: "
                       + str(trainRDD.count()) + " testRDD count: "
                       + str(testRDD.count()) + '\n')
errorPredictionsTree.write("Region ID: " + str(name) + " Test Error rate: "
                           + str(testErr) + '\n' + "Model:" + "\n"
                           + model.toDebugString())

print("Average error rate: " + str(avg / count))
errorPredictions.write("Average error rate: " + str(avg / count))