Example #1
 def _set_rddModel(self, _type, _SLA, data):
     if _type == 'regression':
         if _SLA == 'randomForest':
             self._rddModel = RandomForest.trainRegressor(
                 data,
                 categoricalFeaturesInfo={},
                 numTrees=int(self.sparkOptions[4]),
                 featureSubsetStrategy=self.sparkOptions[5],
                 impurity='variance',
                 maxDepth=int(self.sparkOptions[1]),
                 maxBins=32)
         else:
             self._rddModel = ""
     else:  #classification
         if _SLA == 'randomForest':
            print(self.numClasses)
             self._rddModel = RandomForest.trainClassifier(
                 data,
                 numClasses=self.numClasses,
                 categoricalFeaturesInfo={},
                 numTrees=int(self.sparkOptions[4]),
                 maxDepth=int(self.sparkOptions[1]),
                 featureSubsetStrategy=self.sparkOptions[5],
                 impurity=self.sparkOptions[2])
         else:
             self._rddModel = ""
Example #2
def predict(training_data, test_data):
    # TODO: Train random forest classifier from given data
    # Result should be an RDD with the prediction of the random forest for each
    # test data point

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import RandomForest

    # Segregate each row into a label (last element) and features (the rest)
    labeled_data = []
    for x in training_data.collect():
        labeled_data.append(LabeledPoint(x[-1], x[:-1]))

    numClasses = 2
    categoricalFeaturesInfo = {}
    numTrees = 4
    featureSubsetStrategy = "auto"
    impurity = "gini"
    maxDepth = 6
    maxBins = 32
    seed = 12345

    model = RandomForest.trainClassifier(sc.parallelize(labeled_data),
                                         numClasses, categoricalFeaturesInfo,
                                         numTrees, featureSubsetStrategy,
                                         impurity, maxDepth, maxBins, seed)

    predict_rdd = model.predict(test_data)
    #print(predict_rdd)

    return predict_rdd
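Note that collecting the training RDD to the driver, as above, only scales to small datasets. A sketch of the same pipeline built with distributed transformations, assuming each row is a sequence whose last element is the label:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

def predict_distributed(training_data, test_data):
    # Build LabeledPoints on the executors instead of the driver
    labeled_data = training_data.map(lambda x: LabeledPoint(x[-1], x[:-1]))
    model = RandomForest.trainClassifier(labeled_data, numClasses=2,
                                         categoricalFeaturesInfo={}, numTrees=4,
                                         featureSubsetStrategy="auto", impurity="gini",
                                         maxDepth=6, maxBins=32, seed=12345)
    # predict accepts an RDD of feature vectors directly
    return model.predict(test_data)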
Example #3
def rfTest(sqlContext,dataset_rdd):
	dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
	dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
	train_positive = dataset_positive.sample(False, 0.8)
	test_positive = dataset_positive.subtract(train_positive)
	train_negative = dataset_negative.sample(False, 0.8)
	test_negative = dataset_negative.subtract(train_negative)
	trainset_rdd = train_positive.union(train_negative)
	testset_rdd = test_positive.union(test_negative)
	trainset = trainset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
	trainset_nums = trainset.count()
	testset = testset_rdd.map(lambda e:LabeledPoint(e[1],e[2:]))
	testset_nums = testset.count()
	trainset_positive = train_positive.count()
	testset_positive = test_positive.count()
	model = RandomForest.trainClassifier(trainset,2,{},3)
	predictions = model.predict(testset.map(lambda x:x.features))
	predict = testset.map(lambda lp: lp.label).zip(predictions)
	hitALL =predict.filter(lambda e:e[0]==e[1]).count()
	hitPositive = predict.filter(lambda e:e[0]==e[1] and (e[0]>0.5)).count()
	positive = predict.filter(lambda e:e[1]>0.5).count()
	recallPositive = hitPositive/float(testset_positive)
	precision = hitPositive/float(positive)
	accuracy = hitALL/float(testset.count())
	F_Value = 2/(1/precision+1/recallPositive)
	return (trainset_nums,testset_nums,trainset_positive,testset_positive,positive,hitPositive,precision,recallPositive,accuracy,F_Value,model)
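The hand-computed precision, recall, accuracy, and F-measure above can be cross-checked with MLlib's built-in evaluator; a small sketch, assuming `predict` is the (label, prediction) RDD built inside rfTest:

from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects (prediction, label) pairs
metrics = MulticlassMetrics(predict.map(lambda e: (e[1], e[0])))
print(metrics.precision(1.0))  # precision for the positive class
print(metrics.recall(1.0))     # recall for the positive class
print(metrics.fMeasure(1.0))   # F1 for the positive class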
Example #4
def evaluateForest(rawData):
    data = rawData.map(unencodeOneHot)
    (trainData, cvData) = data.randomSplit(weights=[0.9, 0.1])

    trainData.cache()
    cvData.cache()

    forest = RandomForest.trainClassifier(trainData,
                                          numClasses=7,
                                          categoricalFeaturesInfo={
                                              10: 4,
                                              11: 40
                                          },
                                          numTrees=20,
                                          featureSubsetStrategy="auto",
                                          impurity="entropy",
                                          maxDepth=30,
                                          maxBins=300)

    metrics = getMetrics(forest, cvData)

    print(metrics.precision())

    input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
    vector = Vectors.dense(map(lambda x: float(x), input.split(",")))
    print(forest.predict(vector))

    trainData.unpersist()
    cvData.unpersist()
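`getMetrics` and `unencodeOneHot` are not shown here; a plausible `getMetrics` consistent with the `metrics.precision()` call above would wrap the predictions in MulticlassMetrics (an assumption, not the original helper):

from pyspark.mllib.evaluation import MulticlassMetrics

def getMetrics(model, data):
    # Pair each prediction with the true label, then wrap in MulticlassMetrics
    predictionsAndLabels = model.predict(data.map(lambda lp: lp.features)) \
        .zip(data.map(lambda lp: lp.label))
    return MulticlassMetrics(predictionsAndLabels)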
Example #5
def trainevaluatemodel_tree(traindata,validationdata,numtrees,impurity,maxdepth,maxbins,seed):
    starttime=time()
    model=RandomForest.trainClassifier(traindata, numClasses=2, numTrees=numtrees,categoricalFeaturesInfo={},featureSubsetStrategy='auto', impurity=impurity, maxDepth=maxdepth, maxBins=maxbins,seed=seed)
    AUC=evaluation(model,validationdata)
    duration=time()-starttime
    print('Param:'+'\n'+'numtrees:'+str(numtrees)+'\n'+'impurity:'+str(impurity)+'\n'+'maxdepth:'+str(maxdepth)+'\n'+'maxbins:'+str(maxbins)+'\n'+'time:'+str(duration)+'\n'+'AUC:'+str(AUC))
    return (numtrees,impurity,maxdepth,maxbins,duration,AUC)
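`evaluation` is assumed to return an AUC for the model on the validation set; a plausible stand-in using BinaryClassificationMetrics (an assumption, not the original helper):

from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluation(model, validationdata):
    # BinaryClassificationMetrics expects (score, label) pairs
    scoreAndLabels = model.predict(validationdata.map(lambda p: p.features)) \
        .zip(validationdata.map(lambda p: p.label))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC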
Example #6
def random_forest():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    # print("\npyspark version:" + str(sc.version) + "\n")

    data = MLUtils.loadLibSVMFile(sc, './data/sample_libsvm_data.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
        testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save and load model
    model.save(sc, ".model/myRandomForestClassificationModel")
    sameModel = RandomForestModel.load(
        sc, "./model/myRandomForestClassificationModel")
Example #7
def train():
    data = MLUtils.loadLibSVMFile(sc, TEST_DATA_PATH)
    print("[INFO] load complete.")
    # 划分训练集
    data = data.randomSplit([0.2, 0.8])[0]
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=NUM_OF_CLASSES,
                                         categoricalFeaturesInfo={},
                                         numTrees=NUM_OF_TREES,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=MAXDEPTH,
                                         maxBins=MAXBINS)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('[INFO] Test Error = ' + str(testErr))
    print('[INFO] Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, TEST_MODEL_PATH)
    sameModel = RandomForestModel.load(sc, TEST_MODEL_PATH)
Example #8
def generateRandomForest():
    if os.path.exists(RF_PATH):
        print("RF_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)

    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1L)

    # Train a RandomForest model.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=classes.__len__(), categoricalFeaturesInfo={},
                                         numTrees=4, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error', str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save and load model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
Example #9
def main():
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=4,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, options.output_model)
    sameModel = RandomForestModel.load(sc, options.output_model)
Example #10
def testOnce ():
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1-test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=num_trees, featureSubsetStrategy = strat,
                                     impurity='gini', maxDepth = max_depth, maxBins=32)

    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    Mg = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 1).count())
    Ng = float(labelsAndPredictions.filter(lambda (v, p): v == 0 and p == 0).count())
    Ms = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 0).count())
    Ns = float(labelsAndPredictions.filter(lambda (v, p): v == 1 and p == 1).count())
    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    probs = probsAndScores[1].map(lambda x: x/num_trees)
    labelsAndPredictions = labelsAndPredictions.zip(probs)
    labelsAndProbs = testData.map(lambda lp: lp.label).zip(probs)
    save(labelsAndProbs, 'answers')
    print ('Galaxy Purity = ' + str(Ng / (Ng+Ms)))
    print ('Galaxy Completeness = ' + str(Ng / (Ng+Mg)))
    print ('Star Purity = ' + str(Ns / (Ns+Mg)))
    print ('Star Completeness = ' + str(Ns/(Ns+Ms)))
    print ('Accuracy = ' + str(1 - testErr))
    print ('Threshold method accuracy = ' + str(threshold_accuracy))
Example #11
def Random_Forest(filename, sc):

	filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
	# Load and parse the data file into an RDD of LabeledPoint.
	data = MLUtils.loadLibSVMFile(sc, filename)
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a RandomForest model.
	#  Empty categoricalFeaturesInfo indicates all features are continuous.
	#  Note: Use larger numTrees in practice.
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
	                                     numTrees=3, featureSubsetStrategy="auto",
	                                     impurity='gini', maxDepth=4, maxBins=32)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification forest model:')
	print(model.toDebugString())

	# Save and load model
	#model.save(sc, "target/tmp/myRandomForestClassificationModel")
	#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
Example #12
def createRFC(rdd, numclasses, catfeatinfo):

    # overfitting; tune later
    return RandomForest.trainClassifier(rdd,
                                        numClasses=numclasses,
                                        categoricalFeaturesInfo=catfeatinfo,
                                        numTrees=3)
Example #13
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of the data.
    # np.delete returns a new array (it does not modify in place), so rebuild
    # the lists; reassigning the loop variable would have no effect.
    train_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in train_data]
    test_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in test_data]

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the models
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
                                         numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
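`load_data`, `reformatData`, and `testRandomForest` are defined elsewhere; given how `train_set` is parallelized into LabeledPoint records, `reformatData` plausibly pairs each label with its feature vector (an illustrative sketch, not the original code):

from pyspark.mllib.regression import LabeledPoint

def reformatData(data, labels):
    # One LabeledPoint per (label, feature-vector) pair
    return [LabeledPoint(label, features) for label, features in zip(labels, data)]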
Example #14
def createRandomForest(sparkDF, NUMTREES, NUMCLASSES):
	# ===========================
	# douglas fletcher
	# purpose: create random 
	# forest model
	# input: 
	# 	spark type sparkSession
	# 	sparkDF type sparkDF
	# output: 
	# ===========================
	# create labelled point rdd
	data = sparkDF.rdd.map(
		lambda row: LabeledPoint(row["SeriousDlqin2yrs"], list(row[2:]))
	)
	# Workaround for an unidentified bug: force evaluation of the RDD
	val = data.collect()[1]
	# create random forest model
	model = RandomForest.trainClassifier(
		  data
		, numTrees = NUMTREES
		, numClasses = NUMCLASSES
		#, maxDepth = MAXDEPTH
		, impurity='gini'
		, featureSubsetStrategy="auto"
		, categoricalFeaturesInfo={}
		, seed=42
		, maxBins=32
	)
	return model
Example #15
 def train(self, training_data):
     return RandomForest.trainClassifier(training_data,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=6,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=5,
                                         maxBins=32)
Example #16
def main():
    # Reading from the hdfs, removing the header
    # read the titanic train, test csv here 
    trainTitanic = sc.textFile( srcDir + "titanic_train.csv")
    # remove the header 
    trainHeader = trainTitanic.first()
    trainTitanic = trainTitanic.filter(lambda line: line != trainHeader).mapPartitions(lambda x: csv.reader(x))
    trainTitanic.first()
     
     
    # Data Transformations and filter lines with empty strings
    trainTitanic=trainTitanic.map(lambda line: line[1:3]+sexTransformMapper(line[4])+line[5:11])
    trainTitanic=trainTitanic.filter(lambda line: line[3] != '' ).filter(lambda line: line[4] != '' )
    trainTitanic.take(10)
     
    # creating "labeled point" rdds specific to MLlib "(label (v1, v2...vp])"
    trainTitanicLP=trainTitanic.map(lambda line: LabeledPoint(line[0],[line[1:5]]))
    trainTitanicLP.first()
     
    # splitting dataset into train and test set
    # 70% train, 30% test 
    (trainData, testData) = trainTitanicLP.randomSplit([0.7, 0.3])
     
    # Random forest : same parameters as sklearn (?)
    from pyspark.mllib.tree import RandomForest
     
    time_start=time.time()
    model_rf = RandomForest.trainClassifier(trainData, numClasses = 2,
            categoricalFeaturesInfo = {}, numTrees = 100,
            featureSubsetStrategy='auto', impurity='gini', maxDepth=12,
            maxBins=32, seed=None)
     
      
    model_rf.numTrees()
    model_rf.totalNumNodes()
    time_end=time.time()
    time_rf=(time_end - time_start)
    print("RF takes %d s" %(time_rf))
     
    # Predictions on test set
    predictions = model_rf.predict(testData.map(lambda x: x.features))
    # BinaryClassificationMetrics expects (score, label) pairs
    scoreAndLabels = predictions.zip(testData.map(lambda lp: lp.label))

    # first metrics
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    metrics = BinaryClassificationMetrics(scoreAndLabels)
     
    print ('=====================================================')
    print (' output : ')

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
     
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)

    print ('=====================================================')
Example #17
def trainAndSave(filename='RFmodel' + str(num_trees) + strat + str(max_depth)):
    model = RandomForest.trainClassifier(data,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=num_trees,
                                         featureSubsetStrategy=strat,
                                         impurity='gini',
                                         maxDepth=max_depth,
                                         maxBins=32)
    model.save(sc, filename)
Example #18
def malware_predict_and_store(sc, training_set, X_test, num_of_trees, depth):

    classifier_model = RandomForest.trainClassifier(
        sc.parallelize(training_set), 9, {}, num_of_trees, maxDepth=depth)
    result = []
    for index in X_test:
        result.append(int(classifier_model.predict(index) + 1))
    name = 'result.txt'
    df = pd.DataFrame(result)
    df.to_csv(name, header=False, index=False)
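Calling `predict` once per test point keeps every prediction on the driver; the same result can be computed in one distributed pass (a sketch, assuming X_test is a list of feature vectors):

# Predict the whole test set at once, then shift labels back to 1-based
predictions = classifier_model.predict(sc.parallelize(X_test)).collect()
result = [int(p + 1) for p in predictions]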
Example #19
def trainEvaluateModel(trainData):
    model = RandomForest.trainClassifier(trainData,
                                         2,
                                         categoricalFeaturesInfo={},
                                         numTrees=15,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=12,
                                         maxBins=32)
    return model
Example #20
 def train_model(cls, trainData, cateFeaInfo={}, trees=3, impurity="gini",\
     depth=4):
     """
     Train the model
     """
     model = RandomForest.trainClassifier(trainData, numClasses=2,\
         categoricalFeaturesInfo=cateFeaInfo, numTrees=trees, \
         featureSubsetStrategy="auto", impurity=impurity, maxDepth=depth,\
         maxBins=32)
     return model
Example #22
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #23
File: RF.py, Project: bngonmang/FIND
 def evaluate(self, trainingData,  testData=None, metric=None):
     if testData !=None:
         model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                  numTrees=10, featureSubsetStrategy="auto",
                                  impurity='gini', maxDepth=4, maxBins=32)
         predictions = model.predict(testData.map(lambda x: x.features))
         labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
         testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
         print('Test Error = ' + str(testErr))
     else: #cross validation
         pass
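The cross-validation branch is left as `pass`; one way to fill it in, averaging the test error over k random splits (an illustrative sketch, not the original project's code):

def cross_validate(trainingData, k=5):
    # Approximate k-fold CV with k independent 80/20 splits
    errors = []
    for _ in range(k):
        train, test = trainingData.randomSplit([0.8, 0.2])
        model = RandomForest.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={},
                                             numTrees=10, featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=4, maxBins=32)
        predictions = model.predict(test.map(lambda x: x.features))
        pairs = test.map(lambda lp: lp.label).zip(predictions)
        errors.append(pairs.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count()))
    return sum(errors) / len(errors)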
Example #24
def predict(training_data, test_data):
    # TODO: Train random forest classifier from given data
    # Result should be an RDD with the prediction of the random forest for each
    # test data point



    model = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, \
    numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini", \
    maxDepth=RF_MAX_DEPTH, maxBins=RF_MAX_BINS, seed=RANDOM_SEED)
    return model.predict(test_data)
Example #25
def trainModel(trainingData):
	print "\nTrainning Random Forest model started!"
	Utils.logTime()

	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, 
											numTrees=3, featureSubsetStrategy="auto", impurity='gini',
											maxDepth=5, maxBins=32)

	print '\nTraining Random Forest model finished'
	Utils.logTime()
	return model
Example #26
def predict(training_data, test_data):
    # TODO: Train random forest classifier from given data
    # Result should be an RDD with the prediction of the random forest for each
    # test data point
    model = RandomForest.trainClassifier(training_data, numClasses=2, \
        categoricalFeaturesInfo={}, \
    numTrees=250, featureSubsetStrategy="auto", impurity="gini", \
    maxDepth=6,  seed=0)
    predictions = model.predict(test_data)

    return predictions
Example #27
def kfolds():
    #folds = kFold(data, k) this would work in java
    acc = 0
    spurity = 0
    scomp = 0
    gpurity = 0
    gcomp = 0
    foldsize = data.count() / k
    tested = sc.parallelize([])
    for i in range(k):
        test = sc.parallelize(
            data.subtract(tested).takeSample(False, foldsize))
        tested = tested.union(test)
        train = data.subtract(test)
        # train the random forest
        model = RandomForest.trainClassifier(train,
                                             numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=num_trees,
                                             featureSubsetStrategy="auto",
                                             impurity='gini',
                                             maxDepth=max_depth,
                                             maxBins=32)

        predictions = model.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(
            lambda (v, p): v != p).count() / float(test.count())
        Mg = float(
            labelsAndPredictions.filter(lambda
                                        (v, p): v == 0 and p == 1).count())
        Ng = float(
            labelsAndPredictions.filter(lambda
                                        (v, p): v == 0 and p == 0).count())
        Ms = float(
            labelsAndPredictions.filter(lambda
                                        (v, p): v == 1 and p == 0).count())
        Ns = float(
            labelsAndPredictions.filter(lambda
                                        (v, p): v == 1 and p == 1).count())

        gpurity += (Ng / (Ng + Ms))
        gcomp += (Ng / (Ng + Mg))
        spurity += (Ns / (Ns + Mg))
        scomp += (Ns / (Ns + Ms))
        acc += (1 - testErr)

    print 'with ' + str(k) + ' folds:'
    print('Average Galaxy Purity = ' + str(gpurity / k))
    print('Average Galaxy Completeness = ' + str(gcomp / k))
    print('Average Star Purity = ' + str(spurity / k))
    print('Average Star Completeness = ' + str(scomp / k))
    print('Average Accuracy = ' + str(acc / k))
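As the comment at the top notes, kFold exists in the Java API but not in Python. The subtract/takeSample loop above works but re-scans the data on every fold; disjoint folds can also be built once with randomSplit (a sketch, seed chosen arbitrarily):

# Split once into k disjoint folds, then rotate the held-out fold
folds = data.randomSplit([1.0 / k] * k, seed=17)
for i in range(k):
    test = folds[i]
    train = sc.union([folds[j] for j in range(k) if j != i])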
Example #28
def RF_train(data, filename):
    data_train = split_data(data)
    key_FT = data_train.map(lambda x: LabeledPoint(x[1], x[-1]))
    training, test = key_FT.randomSplit([0.8, 0.2], 0)
    model_RF = RandomForest.trainClassifier(training, 2, {}, 5, seed=42)
    predictionAndlabel = test.map(
        lambda x: (float(model_RF.predict(x.features)), x.label))
    accuracy = 1.0 * predictionAndlabel.filter(
        lambda (x, v): x == v).count() / test.count()
    print("accuracy of model_RF:%f" % accuracy)
    pre_all(data, model_RF, filename)
    return model_RF, accuracy
Example #29
 def train_random_forest(train_rdd):
     # Build Model
     model = RandomForest.trainClassifier(train_rdd,
                                          numClasses=2,
                                          categoricalFeaturesInfo={},
                                          numTrees=15,
                                          featureSubsetStrategy="auto",
                                          impurity='gini',
                                          maxDepth=9,
                                          maxBins=32,
                                          seed=42)
     return model
Example #30
def trainOptimalModel(trainingData, testData):
	print "\nTraining optimal Random Forest model started!"
	Utils.logTime()

	numTreesVals = [3,5,8]
	featureSubsetStrategyVals = ['auto','all','sqrt','log2','onethird']
	impurityVals = ['gini', 'entropy']
	maxDepthVals = [3,4,5,6,7]
	maxBinsVals = [8,16,32]

	optimalModel = None
	optimalNumTrees = None
	optimalFeatureSubsetStrategy = None
	optimalMaxDepth = None
	optimalImpurity = None
	optimalBinsVal = None
	minError = None

	try:
		for curNumTree in numTreesVals:
			for curFeatureSubsetStrategy in featureSubsetStrategyVals:
				for curImpurity in impurityVals:
					for curMaxDepth in maxDepthVals:
						for curMaxBins in maxBinsVals:
							model = RandomForest.trainClassifier(trainingData, 
																numClasses=2, 
																categoricalFeaturesInfo={}, 
														 		numTrees=curNumTree,
														 		featureSubsetStrategy=curFeatureSubsetStrategy,
														 		impurity=curImpurity, 
														 		maxDepth=curMaxDepth,
														 		maxBins=curMaxBins)
							testErr = Evaluation.evaluate(model, testData)
							if minError is None or testErr < minError:
								minError = testErr
								optimalNumTrees = curNumTree
								optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
								optimalImpurity = curImpurity
								optimalMaxDepth = curMaxDepth
								optimalBinsVal = curMaxBins
								optimalModel = model
	except:
		msg = "\nException during model training with below parameters:"
		msg += "\tnum trees: " + str(curNumTree)
		msg += "\tfeature subset strategy: " + curFeatureSubsetStrategy
		msg += "\timpurity: " + str(curImpurity)
		msg += "\tmaxDepth: " + str(curMaxDepth)
		msg += "\tmaxBins: " + str(curMaxBins)
		Utils.logMessage(msg)

	logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
	return optimalModel 
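One practical note on the grid search above: it fits 3 x 5 x 2 x 5 x 3 = 450 forests on the same RDD, so caching the input once avoids recomputing its lineage on every fit:

trainingData.cache()  # call once before the nested loops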
Example #31
def predict_and_save(sc, train_data, X_test, num_trees, max_depth):
    '''Trains a Random Forest classifier.
    Loops over different values of trees and max depth.
    '''
    model = RandomForest.trainClassifier(sc.parallelize(train_data), 9, {}, num_trees,  maxDepth = max_depth)
    a = []
    for i in X_test:
         a.append(int(model.predict(i) + 1))
    trees_s = str(num_trees)
    depth_s = str(max_depth)
    string = 'submit' + trees_s + depth_s + '.txt'
    b = pd.DataFrame(a)
    b.to_csv(string, header = False, index = False)
Example #32
def testClassification(trainingData, testData):    
	# Train a RandomForest model.    
	#  Empty categoricalFeaturesInfo indicates all features are continuous.    
	#  Note: Use larger numTrees in practice.    
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.    
	model = RandomForest.trainClassifier(trainingData, numClasses=2,categoricalFeaturesInfo={},numTrees=3, featureSubsetStrategy="auto",impurity='gini', maxDepth=4, maxBins=32)
	# Evaluate model on test instances and compute test error    
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count()/float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification forest model:')
	print(model.toDebugString())
Example #34
def test_impurity(n):
    model1 = RandomForest.trainClassifier(trainingData,
                                          numClasses=4,
                                          categoricalFeaturesInfo={},
                                          numTrees=n,
                                          featureSubsetStrategy="auto",
                                          impurity='entropy',
                                          maxDepth=5,
                                          maxBins=32)
    predictions1 = model1.predict(testData.map(lambda x: x.features))
    labelsAndPredictions1 = testData.map(lambda lp: lp.label).zip(predictions1)
    testErr1 = labelsAndPredictions1.filter(
        lambda (v, p): v != p).count() / float(testData.count())
    return testErr1
Example #35
    def random_forest(self, train_sample, test_sample, impurity, num_trees):

        #import pdb; pdb.set_trace()
        rf_model = RandomForest.trainClassifier(train_sample,
                                                numClasses=2,
                                                categoricalFeaturesInfo={},
                                                numTrees=int(num_trees),
                                                featureSubsetStrategy="auto",
                                                impurity=impurity,
                                                maxDepth=5,
                                                seed=123)

        # Cross-validate on the model

        return self.cross_validate(rf_model, test_sample)
Example #36
def main():
    ## Data
    iris = datasets.load_iris()
    X = iris.data
    Y = iris.target
    data = [LabeledPoint(y, x) for (x, y) in zip(X, Y)]
    ## Modelling
    model = RandomForest.trainClassifier(sc.parallelize(data),
                                         3, {},
                                         3,
                                         seed=42)
    ## Prediction
    preds = [model.predict(_) for _ in X]
    ## Accuracy
    print(sum(preds == Y) * 1.0 / len(Y))
Example #37
def get_rf_model(sc, train=None):
    model_path = 'rf.model'
    if train is None:
        model = RandomForestModel.load(sc, model_path)
    else:
        model = RandomForest.trainClassifier(train,
                                             numClasses=2,
                                             numTrees=10,
                                             categoricalFeaturesInfo={},
                                             featureSubsetStrategy="auto",
                                             impurity='gini',
                                             maxDepth=10,
                                             maxBins=100)
        model.save(sc, model_path)

    return model
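Usage of the load-or-train helper above: pass an RDD of LabeledPoint to train and persist, or omit it to load the previously saved model (trainingData here is any such RDD):

model = get_rf_model(sc, train=trainingData)  # first run: trains and saves 'rf.model'
model = get_rf_model(sc)                      # later runs: loads 'rf.model'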
Example #38
def predictions_RF(train_data_labeled,test_data_labeled,RF_NUM_TREES):

    time_start=time.time()
    model_rf = RandomForest.trainClassifier(train_data_labeled, numClasses=10, categoricalFeaturesInfo={},
            numTrees=RF_NUM_TREES, featureSubsetStrategy="auto", impurity="gini",
            maxDepth=10, maxBins=32, seed=10)


    predictions = model_rf.predict(test_data_labeled.map(lambda x: x.features))
    # saveAsTextFile returns None; this just persists the true labels to HDFS
    test_data_labeled.map(lambda x: x.label).repartition(1).saveAsTextFile("hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/stage3")
    labels_and_predictions = test_data_labeled.map(lambda x: x.label).zip(predictions)
    rfAccuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data_labeled.count())

    time_end=time.time()
    time_rf=(time_end - time_start)
    print("=========================================================================================================")
    print("run time: {},RandomForest accuracy: {}".format(time_rf,rfAccuracy))
Example #39
def testClassification(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count()\
        / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
Example #40
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Example #41
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                    numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)

    return model
Example #43
    return LabeledPoint(label, features)

ml_data = sc.textFile('final_output_final.csv').map(parseLine)

(trainingData, testData) = ml_data.randomSplit([0.7, 0.3])

categoricalFeaturesInfo = {0:len(set(cols[agency_index])), 1:len(set(cols[comp_index])), 2:len(set(cols[loc_index])), 3:len(set(cols[incident_index])), 4:len(set(cols[add_index])), 5:len(set(cols[city_index])), 6:len(set(cols[fac_index])), 7:len(set(cols[status_index])), 8:len(set(cols[bor_index]))}
numClasses = 2
numTrees = 3
featureSubsetStrategy="auto"
impurity='gini'
maxDepth=20
maxBins= max(categoricalFeaturesInfo.values()) + 10

model = RandomForest.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
                                     numTrees, featureSubsetStrategy,
                                     impurity, maxDepth, maxBins)

model.save(sc, "modelCritical")

predictions = model.predict(testData.map(lambda x: x.features))
#print "HERE I AM"
#print type(predictions.collect())
temp = predictions.collect()
print temp
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
accuracy = labelsAndPredictions.filter(lambda (v, p): v == p).count() / float(testData.count())
print('Accuracy = ' + str(accuracy))

Example #44
predictList = []
# MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/spam_docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.

rr = RandomForest.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=4,
    maxBins=32,
)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
posErr = (
    float(labelsAndPredictions.filter(lambda (v, p): v == 0.0 and v != p).count())
    / testData.filter(lambda lp: lp.label == 0.0).count()
)
negErr = (
    float(labelsAndPredictions.filter(lambda (v, p): v == 1.0 and v != p).count())
    / testData.filter(lambda lp: lp.label == 1.0).count()
)
Example #45
    # categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
    # data.cache()
    # mappings = [get_mapping(data, i) for i in categorical]


    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))

    return labelpoints

data = label_points(data_raw)
training, testing = data.randomSplit([0.5, 0.5], 0)


model = RandomForest.trainClassifier(training, numClasses=7, categoricalFeaturesInfo={},
                                     numTrees=1000, featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testing.count())
print testErr


#
# # https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# # Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
#
# # cat_len = sum(map(len, mappings))
Example #46
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/PartyVectors/')
# Split the data into training and test sets
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()

# The depth of the tree proved to be a significant bottleneck
model = RandomForest.trainClassifier(trainingData, numClasses=4, categoricalFeaturesInfo={},
                                     numTrees=700, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=8, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print("")
print("")
print('Test Error: ' + str(testErr))
Example #48
File: RF.py, Project: bngonmang/FIND
 def train(self, trainingData):
     self.model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                  numTrees=10, featureSubsetStrategy="auto",
                                  impurity='gini', maxDepth=4, maxBins=32)
Example #49
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for survived, record in sample_data:
        lp = LabeledPoint(survived, tuple(record))
        lparr.append(lp)

    for survived, record in test_data:
        lp = LabeledPoint(survived, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    classificationModel = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={},
                                                       numTrees=3)
    result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))
    print classificationModel
    print classificationModel.toDebugString()
    print "==============================="
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

    print mean_absolute_error(actual_data, predicted_data)
    print accuracy_score(actual_data,predicted_data)
    print(classificationModel)

    #for p in predicted_data:
    #    print p
    break
    dataPath = 'train_svm'# 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)
    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    #model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
    #                                     categoricalFeaturesInfo=categoricalFeaturesInfo)
    model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses, categoricalFeaturesInfo={},
                                         numTrees=30, featureSubsetStrategy='auto', impurity='gini',
                                         maxDepth=8, maxBins=40)
    # Print learned forest and stats.
    print origToNewLabels
    print "Trained RandomForest for classification:"
#    print "  Model numNodes: %d" % model.numNodes()
#    print "  Model depth: %d" % model.depth()
    print "  Training accuracy: %g" % getAccuracy(model, reindexedData)
#    if model.numNodes() < 20:
#        print model.toDebugString()
#    else:
#        print model
    print model
    testdata = MLUtils.loadLibSVMFile(sc, 'test_svm')
    predictions = model.predict(testdata.map(lambda x: x.features))
#    labels = testdata.map(lambda l:l.label)
Example #52
data = sc.textFile("team_result.txt")
data = data.map(lambda line: line.split(","))
data = data.map(lambda x: LabeledPoint(float(x[5]), [x[0], x[1], x[2], x[3], x[4]]))

# Split the dataset into training set (70%) and test set (30%)
trainingData, testData = data.randomSplit([0.7, 0.3], seed=1071)

# Create and train the naive Bayes model
naiveBayesModel = NaiveBayes.train(trainingData, 1.0)

# Apply the model to the test set
predictionAndLabelNaiveBayes = testData.map(lambda x: (naiveBayesModel.predict(x.features), x.label))

# Calculate the accuracy of the model
errorNaiveBayes = 1.0 * predictionAndLabelNaiveBayes.filter(lambda (x, y): x != y).count() / testData.count()
print "Naive Bayes model classification error: {0:f}".format(errorNaiveBayes)

# Create and train the random forest model
randomForestModel = RandomForest.trainClassifier(trainingData, numClasses=2,
                                                 categoricalFeaturesInfo={0: 9, 1: 9, 2: 9, 3: 9, 4: 9}, numTrees=3,
                                                 impurity="gini", maxDepth=4, maxBins=32, seed=1071)

'''
Note taken from the official API documentation:
In Python, predict cannot currently be used within an RDD
transformation or action. Call predict directly on the RDD instead.
'''
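# For illustration, the unsupported pattern the note above warns about would be:
#   testData.map(lambda x: randomForestModel.predict(x.features))  # fails inside a transformation
# The call on the next line, over an RDD of feature vectors, is the supported form.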
predictionsRandomForest = randomForestModel.predict(testData.map(lambda x: x.features))
labelsAndPredictionsRF = testData.map(lambda x: x.label).zip(predictionsRandomForest)
errorRandomForest = labelsAndPredictionsRF.filter(lambda (x, y): x != y).count() / float(testData.count())
print "Random forest classification error: {0:f}".format(errorRandomForest)
Example #53
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
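    # Positional arguments to the two tree trainers above: numClasses=2,
    # categoricalFeaturesInfo={} (all features continuous), and numTrees=10
    # for the forest; remaining parameters keep their defaults.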
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
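
# The helpers test(), fone(), and accuracy() used in main() are not included
# in this snippet; below is a hypothetical sketch of their contracts, assuming
# test() returns an RDD of (label, prediction) pairs.
def test(model, testDatardd):
    # Predict on the whole features RDD (tree models cannot call predict
    # inside a transformation), then zip the labels back on.
    predictions = model.predict(testDatardd.map(lambda _: _['feature']))
    return testDatardd.map(lambda _: _['label']).zip(predictions)

def accuracy(resultrdd):
    return resultrdd.filter(lambda r: r[0] == r[1]).count() / float(resultrdd.count())

def fone(resultrdd):
    tp = resultrdd.filter(lambda r: r[0] == 1 and r[1] == 1).count()
    fp = resultrdd.filter(lambda r: r[0] == 0 and r[1] == 1).count()
    fn = resultrdd.filter(lambda r: r[0] == 1 and r[1] == 0).count()
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0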
Example #54
# ### Reducing data size

Data1=Data.sample(False,0.1, seed=255).cache()
(trainingData,testData)=Data1.randomSplit([0.7,0.3],seed=255)
trainingData.cache()
testData.cache()

# ### Random forest

from pyspark.mllib.tree import RandomForest, RandomForestModel

errors={}
for depth in [16]:
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=18, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth]

errors={}
for depth in [18]:
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=15, maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys():  # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted = model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions=data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v,p):v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth]
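
# A hedged generalisation of the two sweeps above (assumes the trainingData
# and testData RDDs defined earlier): one helper that trains a forest per
# (numTrees, depth) pair and records train/test error.
def sweepForest(param_grid):
    errors = {}
    for numTrees, depth in param_grid:
        model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=numTrees, maxDepth=depth)
        errors[(numTrees, depth)] = {}
        for name, data in [('train', trainingData), ('test', testData)]:
            predicted = model.predict(data.map(lambda x: x.features))
            pairs = data.map(lambda lp: lp.label).zip(predicted)
            errors[(numTrees, depth)][name] = \
                pairs.filter(lambda vp: vp[0] != vp[1]).count() / float(data.count())
    return errors

print sweepForest([(18, 16), (15, 18)])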
Example #55
numFeatures = parsed_data.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
labeled_data = parsed_data.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

unbalance_test = data_ans_0827.map(feature_char_to_num).cache()
l_unbal_te = unbalance_test.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))


# Split the data into training and test sets
(trainData, testData) = labeled_data.randomSplit([0.9, 0.1])

len_list = [len(i) for i in fe]
col_na_l = [i - 1 for i in col_na]  # shift by one: the first element of each vector was sliced off ([1:-2])
col_na_l = [i - 1 for i in col_na_l if i >= 83]  # shift again to account for dropping the 85th column
features_dict = dict(zip(col_na_l, len_list))  # categorical-feature dict, e.g. {1: 3, 5: 8} (built but not passed below)
model = RandomForest.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=50, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=5, maxBins=32)


# Evaluate the model on the unbalanced test instances and compute the test error
predictions = model.predict(l_unbal_te.map(lambda x: x.features))
labelsAndPredictions = l_unbal_te.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(l_unbal_te.count())
print('Test Error = ' + str(testErr))
true_positive = labelsAndPredictions.filter(lambda (v, p): v == p and p == 1).count() / float(labelsAndPredictions.filter(lambda (v, p): v == 1).count())
print "recall = TP/(TP+FN)", true_positive
f_true = labelsAndPredictions.filter(lambda (v, p): v == p and v == 1).count() / float(labelsAndPredictions.filter(lambda (v, p): p == 1).count())
print "precision = TP/(TP+FP)", f_true
print "positive/negative ratio", labelsAndPredictions.filter(lambda (v, p): v == 1).count() / float(labelsAndPredictions.filter(lambda (v, p): v == 0).count())
#print "negatives", labeled_data.filter(lambda p: p.label == 0).count()
#print "positives", labeled_data.filter(lambda p: p.label == 1).count()
Example #56
File: tests.py  Project: HodaAlemi/spark
    def test_classification(self):
        import os
        import tempfile
        from shutil import rmtree

        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
def trainRandomForest(data):
    # Train a 9-class random forest; empty categoricalFeaturesInfo treats all
    # features as continuous.
    return RandomForest.trainClassifier(data, numClasses=9, categoricalFeaturesInfo={},
                                        numTrees=10, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=30, maxBins=32)

# The per-region evaluation below was unreachable code after the return above;
# it is shown at top level, assuming `name`, `trainRDD`, `testRDD`, `avg`,
# `count`, `start_time`, `errorPredictions`, and `errorPredictionsTree` are
# bound by an enclosing per-region loop.
#trainingData = trainRDD.map(lambda x: LabeledPoint([k.strip() for k in x.split(",") if k][-1], [k.strip() for k in x.split(",") if k][1:4]))
#testData = testRDD.map(lambda x: LabeledPoint([k.strip() for k in x.split(",") if k][-1], [k.strip() for k in x.split(",") if k][1:4]))

# New transformed dataset: nxtLoc <- Day + CurrLoc + NxtTimeInt
trainingData = trainRDD.map(lambda x: LabeledPoint(
    [k.strip() for k in x.split(",") if k][-1],
    [k.strip() for k in x.split(",") if k][1:2] + [k.strip() for k in x.split(",") if k][3:5]))
testData = testRDD.map(lambda x: LabeledPoint(
    [k.strip() for k in x.split(",") if k][-1],
    [k.strip() for k in x.split(",") if k][1:2] + [k.strip() for k in x.split(",") if k][3:5]))
 
#model = DecisionTree.trainClassifier(trainingData, numClasses=400, categoricalFeaturesInfo={}, impurity='entropy', maxDepth=10, maxBins=32)
#ModelDict[name] = model

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: use a larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, numClasses=400, categoricalFeaturesInfo={},
                                     numTrees=17, featureSubsetStrategy="auto",
                                     impurity='entropy', maxDepth=9, maxBins=32)

# Evaluate the model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
avg = avg + testErr
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())
errorPredictions.write("Region ID: " + str(name) + "  Test Error: " + str(testErr) +
                       "  Time taken: " + str(time.time() - start_time) +
                       "  trainRDD count: " + str(trainRDD.count()) +
                       "  testRDD count: " + str(testRDD.count()) + '\n')
errorPredictionsTree.write("Region ID: " + str(name) + "  Test Error rate: " + str(testErr) +
                           '\n' + "Model:" + '\n' + model.toDebugString())

print "Average error rate: "+str(avg/count)
errorPredictions.write("Average error rate: "+str(avg/count))
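
# Hedged sketch of the per-region driver implied above; `regionRDDs` (a dict of
# region id -> (trainRDD, testRDD)) and `parseLine` (CSV line -> LabeledPoint)
# are hypothetical names, not part of the original.
import time

def evaluateRegion(name, trainRDD, testRDD):
    start_time = time.time()
    trainingData = trainRDD.map(parseLine)  # parseLine: hypothetical parser
    testData = testRDD.map(parseLine)
    model = trainRandomForest(trainingData)
    predictions = model.predict(testData.map(lambda x: x.features))
    pairs = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = pairs.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    errorPredictions.write("Region ID: %s  Test Error: %f  Time taken: %.1fs\n"
                           % (name, testErr, time.time() - start_time))
    return testErr

#errs = [evaluateRegion(n, tr, te) for n, (tr, te) in regionRDDs.items()]
#print "Average error rate: " + str(sum(errs) / len(errs))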