Example #1
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

def rfTest(sqlContext, dataset_rdd):
	dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
	dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
	train_positive = dataset_positive.sample(False, 0.8)
	test_positive = dataset_positive.subtract(train_positive)
	train_negative = dataset_negative.sample(False, 0.8)
	test_negative = dataset_negative.subtract(train_negative)
	trainset_rdd = train_positive.union(train_negative)
	testset_rdd = test_positive.union(test_negative)
	trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
	trainset_nums = trainset.count()
	testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
	testset_nums = testset.count()
	trainset_positive = train_positive.count()
	testset_positive = test_positive.count()
	model = RandomForest.trainClassifier(trainset, 2, {}, 3)
	predictions = model.predict(testset.map(lambda x: x.features))
	predict = testset.map(lambda lp: lp.label).zip(predictions)
	hitALL = predict.filter(lambda e: e[0] == e[1]).count()
	hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
	positive = predict.filter(lambda e: e[1] > 0.5).count()
	recallPositive = hitPositive / float(testset_positive)
	precision = hitPositive / float(positive)
	accuracy = hitALL / float(testset.count())
	F_Value = 2 / (1 / precision + 1 / recallPositive)
	return (trainset_nums, testset_nums, trainset_positive, testset_positive, positive,
	        hitPositive, precision, recallPositive, accuracy, F_Value, model)
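The hand-rolled metrics above raise ZeroDivisionError when there are no positive predictions or no positive test rows. As a minimal sketch of an alternative (assuming the same predict RDD of (label, prediction) pairs built above, and assuming the positive class is labeled 1.0), pyspark.mllib.evaluation.MulticlassMetrics computes the same figures:

from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects (prediction, label) pairs, so swap the tuple first.
metrics = MulticlassMetrics(predict.map(lambda e: (e[1], e[0])))
precision = metrics.precision(1.0)  # precision for the assumed positive label
recall = metrics.recall(1.0)        # recall for the assumed positive label
f1 = metrics.fMeasure(1.0)          # F1 for the assumed positive label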
Example #2
def Random_Forest(filename, sc):

	filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
	# Load and parse the data file into an RDD of LabeledPoint.
	data = MLUtils.loadLibSVMFile(sc, filename)
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a RandomForest model.
	#  Empty categoricalFeaturesInfo indicates all features are continuous.
	#  Note: Use larger numTrees in practice.
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
	                                     numTrees=3, featureSubsetStrategy="auto",
	                                     impurity='gini', maxDepth=4, maxBins=32)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification forest model:')
	print(model.toDebugString())

	# Save and load model
	#model.save(sc, "target/tmp/myRandomForestClassificationModel")
	#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
Example #3
def generateRandomForest():
    if os.path.exists(RF_PATH):
        print("RF_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)

    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    # Train a RandomForest model.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=len(classes), categoricalFeaturesInfo={},
                                         numTrees=4, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error', str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save and load model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
Example #4
def testOnce():
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1-test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=num_trees, featureSubsetStrategy = strat,
                                     impurity='gini', maxDepth = max_depth, maxBins=32)

    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    Mg = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count())
    Ng = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count())
    Ms = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count())
    Ns = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count())
    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    probs = probsAndScores[1].map(lambda x: x/num_trees)
    labelsAndPredictions = labelsAndPredictions.zip(probs)
    labelsAndProbs = testData.map(lambda lp: lp.label).zip(probs)
    save(labelsAndProbs, 'answers')
    print ('Galaxy Purity = ' + str(Ng / (Ng+Ms)))
    print ('Galaxy Completeness = ' + str(Ng / (Ng+Mg)))
    print ('Star Purity = ' + str(Ns / (Ns+Mg)))
    print ('Star Completeness = ' + str(Ns/(Ns+Ms)))
    print ('Accuracy = ' + str(1 - testErr))
    print ('Threshold method accuracy = ' + str(threshold_accuracy))
Example #5
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of each data point
    # (np.delete returns a new array, so assign it back into the list;
    #  rebinding the loop variable alone would be a no-op)
    for i, dataPoint in enumerate(train_data):
        n = np.size(dataPoint)
        train_data[i] = np.delete(dataPoint, [n - 2, n - 1])

    for i, dataPoint in enumerate(test_data):
        n = np.size(dataPoint)
        test_data[i] = np.delete(dataPoint, [n - 2, n - 1])

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the models
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
                                         numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
Example #6
def main():
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    '''sbaronia - get training and testing labeled points'''
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    '''sbaronia - run RandomForest regression on our training data with
    default options except numTrees = 5'''
    rf_model = RandomForest.trainRegressor(train_lp, categoricalFeaturesInfo={}, numTrees=5,
                                           featureSubsetStrategy="auto", impurity='variance',
                                           maxDepth=4, maxBins=32)
    
    '''sbaronia - run predictions on testing data and calculate RMSE value'''
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    rmse = math.sqrt(labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / float(test_lp.count()))

    print("RMSE = " + str(rmse))
Example #7
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example #8
def trainRandomForestModel(data):
    """
    Train a random forest regression model and return it
    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={}, numTrees=2000,
                                        featureSubsetStrategy="auto", impurity="variance",
                                        maxDepth=4, maxBins=32)
    return model
Example #9
 def train_model(cls, trainData, cateFeaInfo={}, trees=3, impurity="gini",
                 depth=4):
     """
     Train the model.
     """
     model = RandomForest.trainClassifier(trainData, numClasses=2,
                                          categoricalFeaturesInfo=cateFeaInfo,
                                          numTrees=trees, featureSubsetStrategy="auto",
                                          impurity=impurity, maxDepth=depth,
                                          maxBins=32)
     return model
Example #10
def trainModel(trainingData):
	print "\nTrainning Random Forest model started!"
	Utils.logTime()

	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, 
											numTrees=3, featureSubsetStrategy="auto", impurity='gini',
											maxDepth=5, maxBins=32)

	print('\nTraining Random Forest model finished')
	Utils.logTime()
	return model
Example #11
 def evaluate(self, trainingData, testData=None, metric=None):
     if testData is not None:
         model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                  numTrees=10, featureSubsetStrategy="auto",
                                  impurity='gini', maxDepth=4, maxBins=32)
         predictions = model.predict(testData.map(lambda x: x.features))
         labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
         testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
         print('Test Error = ' + str(testErr))
     else: #cross validation
         pass
Example #12
def getRandomForestRMSE(trees_array):
	valRMSE_list = []
	for trees in trees_array:
		model = RandomForest.trainRegressor(train_featureScoreTimeRDD, categoricalFeaturesInfo={},
                                    numTrees=trees, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)
		predictions = model.predict(val_featureScoreTimeRDD.map(lambda lp: lp.features))
		labelsAndPreds = val_featureScoreTimeRDD.map(lambda lp: lp.label).zip(predictions)
		valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(val_featureScoreTimeRDD.count())
		valRMSE = valMSE ** 0.5
		valRMSE_list.append((trees, valRMSE))
	return valRMSE_list
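A hedged usage sketch (it assumes the global train_featureScoreTimeRDD and val_featureScoreTimeRDD that the function closes over are already defined; the candidate tree counts are illustrative): sweep a few tree counts and keep the one with the lowest validation RMSE.

# Hypothetical sweep over tree counts using the function above.
results = getRandomForestRMSE([10, 50, 100])
best_trees, best_rmse = min(results, key=lambda tr: tr[1])
print('best numTrees = %d (validation RMSE = %.4f)' % (best_trees, best_rmse))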
Example #13
def trainOptimalModel(trainingData, testData):
	print "\nTraining optimal Random Forest model started!"
	Utils.logTime()

	numTreesVals = [3,5,8]
	featureSubsetStrategyVals = ['auto','all','sqrt','log2','onethird']
	impurityVals = ['gini', 'entropy']
	maxDepthVals = [3,4,5,6,7]
	maxBinsVals = [8,16,32]

	optimalModel = None
	optimalNumTrees = None
	optimalFeatureSubsetStrategy = None
	optimalMaxDepth = None
	optimalImpurity = None
	optimalBinsVal = None
	minError = None

	try:
		for curNumTree in numTreesVals:
			for curFeatureSubsetStrategy in featureSubsetStrategyVals:
				for curImpurity in impurityVals:
					for curMaxDepth in maxDepthVals:
						for curMaxBins in maxBinsVals:
							model = RandomForest.trainClassifier(trainingData, 
																numClasses=2, 
																categoricalFeaturesInfo={}, 
														 		numTrees=curNumTree,
														 		featureSubsetStrategy=curFeatureSubsetStrategy,
														 		impurity=curImpurity, 
														 		maxDepth=curMaxDepth,
														 		maxBins=curMaxBins)
							testErr = Evaluation.evaluate(model, testData)
							if minError is None or testErr < minError:
								minError = testErr
								optimalNumTrees = curNumTree
								optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
								optimalImpurity = curImpurity
								optimalMaxDepth = curMaxDepth
								optimalBinsVal = curMaxBins
								optimalModel = model
	except Exception:
		msg = "\nException during model training with below parameters:"
		msg += "\tnum trees: " + str(curNumTree)
		msg += "\tfeature subset strategy: " + curFeatureSubsetStrategy
		msg += "\timpurity: " + str(curImpurity)
		msg += "\tmaxDepth: " + str(curMaxDepth)
		msg += "\tmaxBins: " + str(curMaxBins)
		Utils.logMessage(msg)

	logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
	return optimalModel 
Example #14
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #15
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #16
def testRegression(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
        .sum() / float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
Example #17
def testClassification(trainingData, testData):
    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count()\
        / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
Example #18
    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Example #19
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                    numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)

    return model
Example #20
def kfolds():
    #folds = kFold(data, k) this would work in java
    acc = 0
    spurity = 0
    scomp = 0
    gpurity = 0
    gcomp = 0
    foldsize = data.count() // k  # takeSample needs an integer count
    tested = sc.parallelize([])
    for i in range(k):
        test = sc.parallelize(data.subtract(tested).takeSample(False, foldsize))
        tested = tested.union(test)
        train = data.subtract(test)
        # train the random forest
        model = RandomForest.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=num_trees, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth = max_depth, maxBins=32)

        predictions = model.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
        testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
        Mg = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count())
        Ng = float(labelsAndPredictions.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count())
        Ms = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count())
        Ns = float(labelsAndPredictions.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count())
        
        gpurity += (Ng / (Ng+Ms))
        gcomp += (Ng / (Ng+Mg))
        spurity += (Ns / (Ns+Mg))
        scomp += (Ns/(Ns+Ms))
        acc += (1 - testErr)
    
    print('with ' + str(k) + ' folds:')
    print ('Average Galaxy Purity = ' + str(gpurity / k))
    print ('Average Galaxy Completeness = ' + str(gcomp / k))
    print ('Average Star Purity = ' + str(spurity / k))
    print ('Average Star Completeness = ' + str(scomp / k))
    print ('Average Accuracy = ' + str(acc / k))
Example #21
    def train_amount_model(self, model, data, i):
        rdd_data = self.sc.parallelize(data)
        self.logger.info('Start to train the amount model')
        if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            input_num = self.feature_num
            layers = [input_num, input_num // 3 * 2, input_num // 3, 1]

            neural_network = NeuralNetworkSpark(layers=layers, bias=0)
            model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                         iteration=15, model=model)
        elif self.amount_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                                featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                                maxBins=32)

        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                  initialWeights=model.weights if model is not None else None)

        else:
            self.logger.error("Unknown training method {}".format(self.amount_prediction_method))
            raise ValueError("Unknown training method {}".format(self.amount_prediction_method))
        return model
Example #22
data = sc.textFile("team_result.txt")
data = data.map(lambda line: line.split(","))
data = data.map(lambda x: LabeledPoint(float(x[5]), [x[0], x[1], x[2], x[3], x[4]]))

# Split the dataset into training set (70%) and test set (30%)
trainingData, testData = data.randomSplit([0.7, 0.3], seed=1071)

# Create and train the naive Bayes model
naiveBayesModel = NaiveBayes.train(trainingData, 1.0)

# Apply the model to the test set
predictionAndLabelNaiveBayes = testData.map(lambda x: (naiveBayesModel.predict(x.features), x.label))

# Calculate the accuracy of the model
errorNaiveBayes = 1.0 * predictionAndLabelNaiveBayes.filter(lambda xy: xy[0] != xy[1]).count() / testData.count()
print("Naive Bayes model classification error: {0:f}".format(errorNaiveBayes))

# Create and train the random forest model
randomForestModel = RandomForest.trainClassifier(trainingData, numClasses=2,
                                                 categoricalFeaturesInfo={0: 9, 1: 9, 2: 9, 3: 9, 4: 9}, numTrees=3,
                                                 impurity="gini", maxDepth=4, maxBins=32, seed=1071)

'''
Note taken from the official API documentation:
In Python, predict cannot currently be used within an RDD
transformation or action. Call predict directly on the RDD instead.
'''
predictionsRandomForest = randomForestModel.predict(testData.map(lambda x: x.features))
labelsAndPredictionsRF = testData.map(lambda x: x.label).zip(predictionsRandomForest)
errorRandomForest = labelsAndPredictionsRF.filter(lambda xy: xy[0] != xy[1]).count() / float(testData.count())
print("Random forest classification error: {0:f}".format(errorRandomForest))
Example #23
def train_randomforest_model(dataset):

    model = RandomForest.trainClassifier(dataset, 2, {}, 3, seed=42)

    return model
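The three positional arguments above are numClasses, categoricalFeaturesInfo, and numTrees. A sketch of the equivalent keyword form, which reads more clearly (the remaining parameters keep their MLlib defaults):

model = RandomForest.trainClassifier(dataset, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, seed=42)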
Example #24
val_data = truetestData.map(lambda line: LabeledPoint(line[7], line[0:7]))

# debug
print(data.take(1))
print(val_data.take(1))

# for holdout validation
(trData, tData) = data.randomSplit([0.7, 0.3])

# random forest training model
mod = RandomForest.trainRegressor(trData,
                                  categoricalFeaturesInfo={
                                      0: 13,
                                      1: 1499,
                                      2: 2
                                  },
                                  numTrees=4,
                                  featureSubsetStrategy="auto",
                                  impurity='variance',
                                  maxDepth=8,
                                  maxBins=1500)

# prediction and evaluation
predictions = mod.predict(tData.map(lambda x: x.features))
pred = mod.predict(val_data.map(lambda x: x.features))
labelsAndPredictions = tData.map(lambda lp: lp.label).zip(predictions)
truePred = val_data.map(lambda lp: lp.label).zip(pred)
metrics = RegressionMetrics(labelsAndPredictions)
met2 = RegressionMetrics(truePred)
# Squared Error
print("Validation MSE = %s" % metrics.meanSquaredError)
Example #25
predictionAndLabel_NB = test.map(lambda p:
                                 (model1.predict(p.features), p.label))

#model 2
from pyspark.mllib.classification import SVMWithSGD

model2 = SVMWithSGD.train(training, iterations=100)
predictionAndLabel_SVM = test.map(lambda p:
                                  (model2.predict(p.features), p.label))

#model 3
from pyspark.mllib.tree import RandomForest

model = RandomForest.trainClassifier(training,
                                     numClasses=6,
                                     numTrees=2,
                                     categoricalFeaturesInfo={},
                                     featureSubsetStrategy="auto",
                                     maxDepth=6,
                                     maxBins=32)
predictions = model.predict(test.map(lambda x: x.features))
predictionAndLabel_RF = test.map(lambda lp: lp.label).zip(predictions)


# -------------- La phase prediction ------------#
def accuracy(predictionAndLabel):
    return 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / test.count()


print('model accuracy {}'.format(accuracy(predictionAndLabel_NB)))
print('model accuracy {}'.format(accuracy(predictionAndLabel_SVM)))
print('model accuracy {}'.format(accuracy(predictionAndLabel_RF)))
Example #26
# Evaluate model on test instances and compute test error
predictions = GBTmodel.predict(test_dense.rdd.map(lambda x: x.features.values))
labelsAndPredictions = test_dense.rdd.map(lambda lp: lp.label).zip(predictions)

testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(test_dense.rdd.count())
print('Test Error = ' + str(testErr))

from pyspark.mllib.tree import RandomForest, RandomForestModel

print('Learned classification RF model:')
train_start = time.time()
RFmodel = RandomForest.trainClassifier(labelPoint_train,
                                       numClasses=2,
                                       categoricalFeaturesInfo={},
                                       numTrees=30,
                                       featureSubsetStrategy="auto",
                                       impurity='gini',
                                       maxDepth=4,
                                       maxBins=32)
train_end = time.time()
print(f'Time elapsed training model: {train_end - train_start} seconds')

predictions = RFmodel.predict(test_dense.rdd.map(lambda x: x.features.values))
labelsAndPredictions = test_dense.rdd.map(lambda lp: lp.label).zip(predictions)

testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(test_dense.rdd.count())
print('Test Error = ' + str(testErr))

spark.stop()
Example #27
def takeAndPrint(time, rdd, num=1000):
    sqlContext = SQLContext(sc)
    result = []
    taken = rdd.take(num + 1)
    print("-------------------------------------------")
    print("Time: %s" % time)
    print("-------------------------------------------")
    for record in taken[:num]:
        vals = tuple(record.split(","))  #[tuple(['Alice', '1'])]
        result.append(vals)

    print(type(result))
    print(result)
    df = sqlContext.createDataFrame(result)  # collect() would return a list, breaking show() below
    df.show()
    # Dataframe for MLLIB's
    # panda dataframe
    sample_data = df.sample(False, 0.5, 83).toPandas()
    sample_data.head()
    # find category and numerical variables
    numeric_cols = [
        "account_length", "number_vmail_messages", "total_day_minutes",
        "total_day_calls", "total_day_charge", "total_eve_minutes",
        "total_eve_calls", "total_eve_charge", "total_night_minutes",
        "total_night_calls", "total_intl_minutes", "total_intl_calls",
        "total_intl_charge"
    ]

    categorical_cols = [
        "state", "international_plan", "voice_mail_plan", "area_code"
    ]

    #some plots

    ax = sb.boxplot(x="churned",
                    y="number_customer_service_calls",
                    data=sample_data,
                    palette="Set3")

    ax.set(xlabel="Churned",
           ylabel="Number of calls made to the customer service")
    plt.show()
    example_numeric_data = sample_data[[
        "total_day_minutes", "total_day_calls", "total_day_charge", "churned"
    ]]
    sb.pairplot(example_numeric_data, hue="churned", palette="husl")
    plt.show()
    # correlation and heatmap
    corr = sample_data[[
        "account_length", "number_vmail_messages", "total_day_minutes",
        "total_day_calls", "total_day_charge", "total_eve_minutes",
        "total_eve_calls", "total_eve_charge", "total_night_minutes",
        "total_night_calls", "total_intl_minutes", "total_intl_calls",
        "total_intl_charge"
    ]].corr()

    sb.heatmap(corr)
    reduced_numeric_cols = [
        "account_length", "number_vmail_messages", "total_day_calls",
        "total_day_charge", "total_eve_calls", "total_eve_charge",
        "total_night_calls", "total_intl_calls", "total_intl_charge"
    ]

    label_indexer = StringIndexer(inputCol='churned', outputCol='label')
    plan_indexer = StringIndexer(inputCol='intl_plan',
                                 outputCol='intl_plan_indexed')

    assembler = VectorAssembler(inputCols=['intl_plan_indexed'] +
                                reduced_numeric_cols,
                                outputCol='features')

    classifier = DecisionTreeClassifier(labelCol='label',
                                        featuresCol='features')

    pipeline = Pipeline(
        stages=[plan_indexer, label_indexer, assembler, classifier])

    (train, test) = df.randomSplit([0.7, 0.3])
    model = pipeline.fit(train)

    # Random forest
    # (note: pyspark.mllib's RandomForest expects an RDD of LabeledPoint,
    #  not the DataFrame produced by randomSplit above)
    from pyspark.mllib.tree import RandomForest
    model2 = RandomForest.trainClassifier(train,
                                          numClasses=2,
                                          categoricalFeaturesInfo={},
                                          numTrees=3,
                                          featureSubsetStrategy="auto",
                                          impurity='gini',
                                          maxDepth=4,
                                          maxBins=32)

    # SVM needs some tweaking- not working as expected

    # ROC chart
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    predictions = model.transform(test)
    evaluator = BinaryClassificationEvaluator()
    auroc = evaluator.evaluate(predictions,
                               {evaluator.metricName: "areaUnderROC"})
    print("AUROC = %s" % auroc)
Example #28
def main():
    sc = SparkContext(conf=SparkConf().setAppName("Random Forest"))
    sqlContext = SQLContext(sc)
    bytePath = "s3n://eds-uga-csci8360/data/project2/binaries"
    namePath = "s3n://eds-uga-csci8360/data/project2/labels/X_train_small.txt"
    nameTestPath = "s3n://eds-uga-csci8360/data/project2/labels/X_test_small.txt"
    classPath = "s3n://eds-uga-csci8360/data/project2/labels/y_train_small.txt"

    #bytePath =  "/Users/priyanka/Desktop/project2files/all"
    #namePath = "/Users/priyanka/Desktop/X_train_small.txt"
    #nameTestPath="/Users/priyanka/Desktop/X_test_small.txt"
    #classPath = "/Users/priyanka/Desktop/y_train_small.txt"

    #docData Output: ('file:/Users/priyanka/Desktop/project2files/train/04mcPSei852tgIKUwTJr.bytes', '00401000 20 FF 58 C0 20 FE 5C 01 F8 00 0F 8B 50 FC 06 01\r\n00401010 8C 01 FF")
    docData = sc.wholeTextFiles(
        bytePath,
        25).map(lambda xy: (xy[0].encode("utf-8"), xy[1].encode("utf-8")))
    print("docData frankie")
    docData.take(1)

    #clean docData here - remove 1st word from line and remove /r/n
    cleanDocData = docData.map(lambda xy: (xy[0], clean(xy[1].split())))

    #try calculating tf here (filename,tf)
    x = 16**2 + 1
    hashingTF = HashingTF(x)
    tfDocData = cleanDocData.map(lambda xy: (xy[0], hashingTF.transform(xy[1])))
    tfDocData.take(1)
    #Output format : (index,filename)
    nameData = sc.textFile(
        namePath, 25).map(lambda x: "file:" + bytePath + "/" + x + ".bytes"
                          ).zipWithIndex().map(lambda xy: (xy[1], xy[0]))
    #nameData.take(5)

    #Output format: (index,label)
    labelData = sc.textFile(
        classPath, 25).zipWithIndex().map(lambda xy: (xy[1], str(int(xy[0]) - 1)))

    #Output format: (filename,label)
    joinNameLabel = nameData.join(labelData).map(lambda xy: xy[1])
    #joinNameLabel.take(5)

    #Output: (label,tf)
    joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda xy: xy[1])

    #Output: (label,tf)
    hashData = joinCleanDocLabel.map(
        lambda lt: LabeledPoint(lt[0], lt[1]))
    print("hashing TF done")

    print("generating model fliss")
    model1 = RandomForest.trainClassifier(hashData,
                                          numClasses=9,
                                          categoricalFeaturesInfo={},
                                          numTrees=50,
                                          featureSubsetStrategy="auto",
                                          impurity='gini',
                                          maxDepth=8,
                                          maxBins=32)

    #==============================================================================
    # Testing starts here
    #==============================================================================
    #Output: (filename,index)
    nameTestData = sc.textFile(nameTestPath, 25).map(
        lambda x: "file:" + bytePath + "/" + x + ".bytes").zipWithIndex()

    #Output: (index,tf)
    joinTestDocLabel = nameTestData.join(tfDocData).map(lambda xy: xy[1])

    print("hashing test kenny")
    hashTestData = joinTestDocLabel.map(
        lambda lt: LabeledPoint(lt[0], lt[1]))
    hashTestData.persist()

    #Random forest prediction and labels and accuracy
    print("prediction part lyndz")
    prediction1 = model1.predict(hashTestData.map(lambda x: x.features))

    prediction1.saveAsTextFile("/Users/priyanka/Desktop/pred.txt")
Example #29
# Evaluate model on test instances and compute test error
predictions = modelDT.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testAccuracy = labelsAndPredictions.filter(
    lambda vp: vp[0] == vp[1]).count() / float(testData.count())
print('Decision Tree Test Accuracy = ' + str(testAccuracy))

# Print the predictions
print("Acutual vs Predicted values - Decision Tree")
print(labelsAndPredictions.collect())

# Train a RandomForest model
modelRF = RandomForest.trainClassifier(trainingData,
                                       numClasses=3,
                                       categoricalFeaturesInfo={},
                                       numTrees=3,
                                       featureSubsetStrategy="auto",
                                       impurity='gini',
                                       maxDepth=4,
                                       maxBins=32)

# Evaluate model on test instances and compute test error
predictions = modelRF.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testAccuracy = labelsAndPredictions.filter(
    lambda vp: vp[0] == vp[1]).count() / float(testData.count())
print('Random Forest Test Accuracy = ' + str(testAccuracy))

# Print the predictions
print("Acutual vs Predicted values - Random Forest")
print(labelsAndPredictions.collect())
Example #30
results = []
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])

    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))

    model = RandomForest.trainClassifier(sc.parallelize(parsedData),
                                         2, {},
                                         3,
                                         seed=42)

    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        #b = 1
        if a != b:
            testErr += 1

    Err += float(testErr) / float(len(X_test))

print("AVG test error: %.6f" % (Err / iter_number))
Example #31
    # categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
    # data.cache()
    # mappings = [get_mapping(data, i) for i in categorical]

    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))

    return labelpoints


data = label_points(data_raw)
training, testing = data.randomSplit([0.5, 0.5], 0)

model = RandomForest.trainClassifier(training,
                                     numClasses=7,
                                     categoricalFeaturesInfo={},
                                     numTrees=1000,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(
    testing.count())
print(testErr)

#
# # https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# # Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
Example #32
def train_random_forest(trainRDD, num_trees, max_depth):
    return RandomForest.trainClassifier(trainRDD,
                                        2, {},
                                        num_trees,
                                        maxDepth=max_depth)
Example #33
from pyspark.mllib.evaluation import MulticlassMetrics

trainDataPath = "s3://cloudpa2/TrainingDataset.csv"
valDataPath = "s3://cloudpa2/ValidationDataset.csv"

sc = SparkContext.getOrCreate()
sc.getConf().setAppName('cloudpa2')

rawData = sc.textFile(trainDataPath)
records = rawData.filter(lambda str: "\"" not in str).map(
    lambda str: str.split(";")).map(lambda strVal: [float(x) for x in strVal])
data = records.map(
    lambda arr: LabeledPoint(int(arr[-1]) - 1, Vectors.dense(arr[:-1])))

valRawData = sc.textFile(valDataPath)
valRecords = valRawData.filter(lambda str: "\"" not in str).map(
    lambda str: str.split(";")).map(lambda strVal: [float(x) for x in strVal])
valData = valRecords.map(lambda arr: (arr[:-1], int(arr[-1]) - 1))

rfModel = RandomForest.trainClassifier(data,
                                       numClasses=10,
                                       categoricalFeaturesInfo={},
                                       numTrees=100)
rfModelPredictionAndLabels = rfModel.predict(
    valData.map(lambda tp: tp[0])).zip(valData.map(lambda _: float(_[1])))

rfModelMetric = MulticlassMetrics(rfModelPredictionAndLabels)
print("F1Score : %s" % (rfModelMetric.accuracy))

rfModel.save(sc, "s3://myprogrambucket123/rfwine_model.model")
Example #34
    dataPath = 'train_svm'  # 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)
    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    #model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
    #                                     categoricalFeaturesInfo=categoricalFeaturesInfo)
    model = RandomForest.trainClassifier(reindexedData, numClasses=numClasses, categoricalFeaturesInfo={},
                                         numTrees=30, featureSubsetStrategy='auto',
                                         impurity='gini', maxDepth=8, maxBins=40)
    # Print learned tree and stats.
    print(origToNewLabels)
    print("Trained RandomForest for classification:")
#    print("  Model numNodes: %d" % model.numNodes())
#    print("  Model depth: %d" % model.depth())
    print("  Training accuracy: %g" % getAccuracy(model, reindexedData))
#    if model.numNodes() < 20:
#        print model.toDebugString()
#    else:
#        print model
    print(model)
    testdata = MLUtils.loadLibSVMFile(sc, 'test_svm')  # needed below, so no longer commented out
#reuben
    predictions = model.predict(testdata.map(lambda x: x.features))
#    labels = testdata.map(lambda l:l.label)
Example #35
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Example #36
df = spark.read.format('com.databricks.spark.csv').csv(
    's3://cs643-wine/TrainingDataset.csv', header=True, sep=";")

# def parsePoint(line):
#     # values = [float(x) for x in line.split(';')]
#     return LabeledPoint(values[11], values[0:10])

parsedTData = df.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[:11])))

#Training data using Random Forest
model = RandomForest.trainClassifier(parsedTData,
                                     numClasses=11,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     impurity='gini',
                                     maxDepth=4,
                                     maxBins=32)

# vData = sc.textFile("ValidationDataset.csv")
# header = vData.first()
# rows = vData.filter(lambda x: x != header)

vdf = spark.read.format('com.databricks.spark.csv').csv(
    's3://cs643-wine/ValidationDataset.csv', header=True, sep=";")

parsedVData = vdf.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[:11])))

predictions = model.predict(parsedVData.map(lambda x: x.features))
Example #37
# Class 18.0 precision = 0.0
# Class 18.0 recall = 0.0
# Class 18.0 F1 Measure = 0.0
# Class 19.0 precision = 0.0
# Class 19.0 recall = 0.0
# Class 19.0 F1 Measure = 0.0
# Class 20.0 precision = 0.0
# Class 20.0 recall = 0.0
# Class 20.0 F1 Measure = 0.0
 
  
training = training_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray()))
test = test_random.rdd.map(lambda row: LabeledPoint(row['label'], row['raw_Features'].toArray()))

#======== RandomForest
rf_model = RandomForest.trainClassifier(training, 21, {}, 50, seed=1000)
rf_model.totalNumNodes()  # 402
 
 
## Compute raw scores on the test set
# predictionAndLabels = test.map(lambda lp: (float(rf_model.predict(lp.features)), lp.label))   # does not work
 
# another way
predictions = rf_model.predict(test.map(lambda x: x.features)) 
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
predictionAndLabels = labelsAndPredictions.map(lambda x: (x[1], x[0]))  # in fact, there is no need to switch

metrics = MulticlassMetrics(predictionAndLabels) 
#metrics = MulticlassMetrics(labelsAndPredictions)  # the same as above

#predictionAndLabels – an RDD of (prediction, label) pairs.
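The per-class figures quoted at the top of this example can be read back out of metrics. A minimal sketch, assuming the 21 class labels implied by the trainClassifier call above:

# Print per-class precision/recall/F1 from the MulticlassMetrics above.
for label in sorted(labelsAndPredictions.map(lambda x: x[0]).distinct().collect()):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))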
Example #38
#Preparing the training data
from pyspark.mllib.regression import LabeledPoint
from numpy import array

data_raw = train_data.rdd.map(lambda x: LabeledPoint(x[12], array((x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8] ,x[9], x[10], x[11]), dtype=float)))
(trainingData, testData) = data_raw.randomSplit([0.7, 0.3])


# In[ ]:

#Training the random forest model
from pyspark.mllib.tree import RandomForest, RandomForestModel

model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={0:13, 1:13, 5:13, 11:14},
                                     numTrees=4, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=7, maxBins=15)


# In[ ]:


#Testing the trained model on the test data and evaluating error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

Example #39
for country_from in country_list:
    for country_to in country_list:
        print("Country from: ", country_from, " Country to: ", country_to)
        try:
            df2 = df.filter(df.Country_from == country_from).filter(
                df.Country_to == country_to)
            df_temp = df2.select(df2.Scrap_time.cast("float"),'Airline1_Back','Airline2_There','Airline2_Back'\
                             ,'Airline1_There',df2.Days.cast("float"),df2.Journey_time.cast("float"), df2.Full_Price.cast("float"))

            for nazwa in nazwy:
                indexer = StringIndexer(inputCol=nazwa,
                                        outputCol=nazwa + "Index")
                df_temp = indexer.fit(df_temp).transform(df_temp)

            df_temp = df_temp.select('Airline1_BackIndex','Airline2_ThereIndex','Airline2_BackIndex','Airline1_ThereIndex','Scrap_time',\
                   'Days','Journey_time', 'Full_Price')
            transformed = transData(df_temp)

            test = transformed.rdd.map(lambda row: LabeledPoint(
                row['label'], row['features'].toArray()))
            model = RandomForest.trainRegressor(test,
                                                categoricalFeaturesInfo={},
                                                numTrees=30,
                                                featureSubsetStrategy="auto",
                                                impurity='variance',
                                                maxDepth=4,
                                                maxBins=32)

            model.save(sc, "modele/" + country_from + "_" + country_to)
        except Exception:
            print("Run it again: ", country_from, " ", country_to)
Example #40
# MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/spam_docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.

rr = RandomForest.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=4,
    maxBins=32,
)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
posErr = (
    float(labelsAndPredictions.filter(lambda vp: vp[0] == 0.0 and vp[0] != vp[1]).count())
    / testData.filter(lambda lp: lp.label == 0.0).count()
)
negErr = (
    float(labelsAndPredictions.filter(lambda vp: vp[0] == 1.0 and vp[0] != vp[1]).count())
    / testData.filter(lambda lp: lp.label == 1.0).count()
)
Example #41
}  # feature 1 has 53 categories, 0 .. 52 (corresponding to weeks 1 .. 53)
# [(crimes, [beat, week, temp])]
# feature 0: beat
# feature 1: week
# feature 2: temp
# featuresDic = {} # for all continuous predictors

maxBins = max(
    len(beatsDic), len(weekDic)
)  # DecisionTree requires maxBins >= max categories in categorical features (304)
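
# Hedged sketch: the featuresDic definition is cut off above; it presumably
# maps each categorical feature index to its category count, e.g.
# featuresDic = {0: len(beatsDic), 1: len(weekDic)}, with temp left continuous.
assert not featuresDic or maxBins >= max(featuresDic.values())  # the rule stated above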

### Fit
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo=featuresDic,
                                    numTrees=10,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=maxBins)
### Evaluate
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
# print(model.toDebugString())

### Compute R2
SSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) *
                               (v_p1[0] - v_p1[1])).sum()
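# Hedged completion sketch: R2 can be computed as 1 - SSE/SST, with SST the
# total squared deviation of the test labels from their mean.
labelMean = testData.map(lambda lp: lp.label).mean()
SST = testData.map(lambda lp: (lp.label - labelMean) * (lp.label - labelMean)).sum()
R2 = 1.0 - SSE / SST
print('Test R2 = ' + str(R2))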
Example #42
    labeled_data = labeled_indices.leftOuterJoin(keyfirst_train).map(
        lambda _: (_[1][1], _[0]))
    unlabeled_data = unlabeled_indices.leftOuterJoin(keyfirst_train).map(
        lambda _: (_[1][1], _[0]))

    print('labeled = ', labeled_indices.count(), ' unlabeled = ',
          unlabeled_indices.count())

    if unlabeled_indices.isEmpty():
        break

    n_estimators = 10
    model = RandomForest.trainClassifier(labeled_data.map(lambda _: _[0]),
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=n_estimators,
                                         featureSubsetStrategy="auto",
                                         impurity='gini')
    # accuracy test on the test set
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda _: _[0] != _[1]).count() / float(test.count())

    n_unlabeled = unlabeled_data.count()

    rdd = sc.parallelize([])
    for tree in model._java_model.trees():
        predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _ : _[0].features))\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))
        rdd = rdd.union(predX)
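
    # Hedged sketch: average the per-tree votes gathered above into a
    # pseudo-probability per row index, then keep the rows where the ensemble
    # is nearly unanimous (a typical self-training promotion rule).
    voteShare = rdd.aggregateByKey(
        (0.0, 0),
        lambda acc, v: (acc[0] + v, acc[1] + 1),
        lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .mapValues(lambda s: s[0] / s[1])
    confident = voteShare.filter(lambda kv: kv[1] >= 0.9 or kv[1] <= 0.1)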
Example #43
     features_modeled_train, features_categorical_indexed_vec_train)
 ## select the one-hot-encoded categorical features along with the numerical features and the label to construct the modeling dataset
 df_train_modeling = df_train.select(features_modeled_train)
 ## df_train_modeling_rdd for mllib package
 df_train_modeling_rdd = df_train_modeling.rdd.map(
     lambda p: convert_sparsevec_to_vec_df(
         p, features_categorical_indexed_vec_index_train))
 df_train_modeling_rdd = df_train_modeling_rdd.map(
     lambda l: LabeledPoint(l[0], l[1:]))
 ################################################## 5: train random forest regression model
 ## random forest
 ## train model
 rfModel = RandomForest.trainRegressor(df_train_modeling_rdd,
                                       categoricalFeaturesInfo={},
                                       numTrees=100,
                                       featureSubsetStrategy="auto",
                                       impurity='variance',
                                       maxDepth=10,
                                       maxBins=32)
 # Predict on train data
 predictions = rfModel.predict(
     df_train_modeling_rdd.map(lambda l: l.features))
 ## Evaluation of the model
 predictionAndObservations = predictions.zip(
     df_train_modeling_rdd.map(lambda l: l.label))
 testMetrics = RegressionMetrics(predictionAndObservations)
 model_time = str(model_time[0][0])
 df_model_performance = sqlContext.createDataFrame(
     sc.parallelize(
         [[model_time, testMetrics.rootMeanSquaredError, testMetrics.r2]]),
     ["model_time", "RMSE", "R2"])
Example #44
print("Number of training set rows: %d" % training_data.count())
print("Number of test set rows: %d" % test_data.count())

# COMMAND ----------

from pyspark.mllib.tree import RandomForest
from time import *

start_time = time()

model = RandomForest.trainClassifier(training_data,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=25,
                                     featureSubsetStrategy="auto",
                                     impurity="gini",
                                     maxDepth=4,
                                     maxBins=32,
                                     seed=13579)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

# COMMAND ----------

predictions = model.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    test_data.count())
        l2.extend([float(a[j])])
    test_features.append(l2)
    l = f.readline()

f.close()

# Random Forest
# C14 - C21
# Build the Model
rf_trees = [25, 50, 100, 150, 200, 250, 300]

for num_rf_trees in rf_trees:

    # Build the Model
    model = RandomForest.trainClassifier(train_data, 2, {}, num_rf_trees)

    rf_train_predict_label = []
    rf_test_predict_label = []

    # Predict labels one record at a time
    for features in test_features:
        rf_test_predict_label.append(model.predict(features))

    for features in train_features:
        rf_train_predict_label.append(model.predict(features))

    # Append Labels
    appendColumn(ensemble_test, rf_test_predict_label)
Example #46
# from pyspark.mllib.tree import RandomForest, RandomForestModel
# model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
#                                      numTrees=3, featureSubsetStrategy="auto",
#                                      impurity='gini', maxDepth=4, maxBins=32)
# evaluate(model,trainingData,testData)

# In[16]:

import pandas as pd
from pyspark.mllib.tree import RandomForest, RandomForestModel
for n in [1, 2, 4, 8, 16, 32, 64, 100]:
    start = time.time()
    model = RandomForest.trainClassifier(parsedData,
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=n,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=30,
                                         maxBins=32)
    taken = time.time() - start
    taken
    pd.DataFrame([[filename, taken, n]]).to_csv('rf_spark.txt',
                                                mode='a',
                                                index=False,
                                                header=False)
#from pyspark.mllib.classification import SVMWithSGD, SVMModel
#model = SVMWithSGD.train(trainingData, iterations=100)
#evaluate(model,trainingData,testData)

# Gradient Boosted Trees
# ------------------------------------------------------------------------------
# Step 5(a):
# Parameters for the Random Forest model
# ------------------------------------------------------------------------------
 
RANDOM_SEED = 10904
RF_NUM_TREES = 100
RF_MAX_DEPTH = 4
RF_MAX_BINS = 100
 
# ------------------------------------------------------------------------------
# Step 5(b):
# Training a Random Forest model on the dataset
# ------------------------------------------------------------------------------
 
model = RandomForest.trainClassifier(transformed_train_df, numClasses=2, categoricalFeaturesInfo={}, \
    numTrees=RF_NUM_TREES, featureSubsetStrategy="log2", impurity="entropy", \
    maxDepth=RF_MAX_DEPTH, maxBins=RF_MAX_BINS, seed=RANDOM_SEED)
 
# ------------------------------------------------------------------------------
# Step 5(c):
# Make predictions and compute accuracy
# ------------------------------------------------------------------------------
 
predictions = model.predict(transformed_test_df.map(lambda x: x.features))
labels_and_predictions = transformed_test_df.map(lambda x: x.label).zip(predictions)
model_accuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(transformed_test_df.count())
print("Model accuracy: %.3f%%" % (model_accuracy * 100))
 
 
# ------------------------------------------------------------------------------
# Step 5(d):
sc = spark.sparkContext

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/diamonds.data')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=123)

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=9,
                                     categoricalFeaturesInfo={},
                                     numTrees=25,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=15,
                                     maxBins=32,
                                     seed=123)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
result = testData.zip(predictions).collect()

# Print the predictions to output file
with open('machine_learning/results/predicted_cut.txt', 'w') as f:
    for i in result:
        f.write(str(i) + '\n')  # assumed output format for each (LabeledPoint, prediction) pair
Example #49
target_data = keyed_data.join(keyed_target)
labled_point_data = target_data.map(lambda tup: LabeledPoint(tup[1][1][0], tup[1][0][0].split(',')))

#map(lambda line: line.split(",")).map(lambda line: tuple((feature for feature in line)))

# Split the data into training and test sets (30% held out for testing)
print("Creating Training and Test Data Split")
(trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.

model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=8, maxBins=32)

# # Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

testAccuracy = labelsAndPredictions.map(lambda vp: 1 if abs(vp[0] - vp[1]) < 10 else 0).sum() / float(testData.count())
print('Total Accuracy = ' + str(testAccuracy))

# print('Learned regression forest model:')
# print(model.toDebugString())

# # Save and load model
Example #50
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            os.removedirs(temp_dir)
        except OSError:
            pass
    test_lp_arr = []
    sample_data = all_data[train_indexes]
    test_data = all_data[test_indexes]

    for survived, record in sample_data:
        lp = LabeledPoint(survived, tuple(record))
        lparr.append(lp)

    for survived, record in test_data:
        lp = LabeledPoint(survived, tuple(record))
        test_lp_arr.append(lp)

    training_data = sc.parallelize(lparr).cache()
    test_data_rdd = sc.parallelize(test_lp_arr).cache()

    classificationModel = RandomForest.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={},
                                                       numTrees=3)
    result = classificationModel.predict(test_data_rdd.map(lambda x: x.features))
    print(classificationModel)
    print(classificationModel.toDebugString())
    print("===============================")
    predicted_data = result.collect()
    actual_data = test_data_rdd.map(lambda x: float(x.label)).collect()

    print(mean_absolute_error(actual_data, predicted_data))  # sklearn.metrics helpers
    print(accuracy_score(actual_data, predicted_data))
    print(classificationModel)

    #for p in predicted_data:
    #    print p
    break
Example #52
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd,
                                          initialWeights=array([1.0, 1.0]),
                                          iterations=10)
            LassoWithSGD.train(rdd,
                               initialWeights=array([1.0, 1.0]),
                               iterations=10)
            RidgeRegressionWithSGD.train(rdd,
                                         initialWeights=array([1.0, 1.0]),
                                         iterations=10)
        except ValueError:
            self.fail()
Example #53
 def train(self, trainingData):
     self.model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                  numTrees=10, featureSubsetStrategy="auto",
                                  impurity='gini', maxDepth=4, maxBins=32)
Example #54
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable


sc = SparkContext()
spark = SparkSession(sc)
inputDF = spark.read.csv('TrainingDataset.csv',
                         header='true', inferSchema='true', sep=';')
featureColumns = [c for c in inputDF.columns if c != 'quality']

transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = RandomForest.trainClassifier(transformed_df, numClasses=10,
                                     categoricalFeaturesInfo={},
                                     numTrees=50, maxBins=64, maxDepth=20, seed=33)
# model.save(sc,"s3://wineprediction/model_created.model")

validDF = spark.read.csv(
    '/testdata/*.csv', header='true', inferSchema='true', sep=';')

datadf = validDF.rdd.map(lambda row: LabeledPoint(
    row[-1], Vectors.dense(row[0:-1])))

predictions = model.predict(datadf.map(lambda x: x.features))

labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(
    lambda x: x[0] == x[1]).count() / float(datadf.count())
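
# Hedged sketch: MulticlassMetrics (imported above) can add a weighted F1,
# which is more informative than raw accuracy on imbalanced quality classes.
metrics = MulticlassMetrics(labels_and_predictions.map(lambda x: (x[1], x[0])))
print("Accuracy: %f, weighted F1: %f" % (acc, metrics.weightedFMeasure()))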

Example #55
def main():
    options = parse_args()

    sc = SparkContext(appName="PythonRandomForestClassificationExample")

    pm.init(sc)

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, options.data_file)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData,
                                         numClasses=options.num_classes,
                                         categoricalFeaturesInfo={},
                                         numTrees=options.num_trees,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=options.max_depth,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    print("Using mlops to report statistics")

    # Adding multiple points (to see a graph in the ui)
    pm.set_stat("numTrees", options.num_trees, st.TIME_SERIES)
    pm.set_stat("numClasses", options.num_classes, st.TIME_SERIES)
    pm.set_stat("maxDepth", options.max_depth, st.TIME_SERIES)
    pm.set_stat("testError", testErr, st.TIME_SERIES)

    # TODO: this should be removed once we have better tests for mlops
    pm.set_stat("stat1", 1.0, st.TIME_SERIES)
    pm.set_stat("stat1", 2.0, st.TIME_SERIES)
    pm.set_stat("stat1", 3.0, st.TIME_SERIES)
    pm.set_stat("stat1", 4.0, st.TIME_SERIES)
    pm.set_stat("stat1", 5.0, st.TIME_SERIES)
    pm.set_stat("stat1", 6.0, st.TIME_SERIES)
    pm.set_stat("stat1", 7.0, st.TIME_SERIES)
    pm.set_stat("stat1", 8.0, st.TIME_SERIES)

    # String
    pm.set_stat("stat2", "str-value", st.TIME_SERIES)

    # Vec
    pm.set_stat("statvec", [4.5, 5.5, 6.6], st.TIME_SERIES)

    list_of_strings = []
    for x in range(1, 10000):
        list_of_strings.append("{},{},{}".format(x, x + 1, x + 2))

    rdd_of_str = sc.parallelize(list_of_strings)
    rdd = rdd_of_str.map(lambda line: Vectors.dense(line.split(",")))

    # Histograms and any input stats
    pm.set_stat("input", rdd, st.INPUT)

    print("Done reporting statistics")
    # Save and load model
    model.save(sc, options.output_model)
    print("Done saving model to {}".format(options.output_model))
    sameModel = RandomForestModel.load(sc, options.output_model)
    # $example off$

    sc.stop()
    pm.done()
Example #57
    # categorical = range(0,30) + range(35,39) + range(41,46) + range(48,57)
    # data.cache()
    # mappings = [get_mapping(data, i) for i in categorical]


    labelpoints = data.map(lambda x: LabeledPoint(x[-1], x[:-1]))

    return labelpoints

data = label_points(data_raw)
training, testing = data.randomSplit([0.5, 0.5], 0)


model = RandomForest.trainClassifier(training, numClasses=7, categoricalFeaturesInfo={},
                                     numTrees=1000, featureSubsetStrategy="auto",
                                     impurity='gini', maxBins=32)

predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
accuracy = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1]).count() / float(testing.count())
print(accuracy)


#
# # https://books.google.com/books?id=syPHBgAAQBAJ&pg=PA166&lpg=PA166&dq=categorical+variables+labeledpoint+pyspark&source=bl&ots=X9VyTR348v&sig=cMf8rZlpbdWcyCl2jSPNU1Var6k&hl=en&sa=X&ved=0ahUKEwjPpofhh8XMAhVI1WMKHXoqCio4ChDoAQgbMAA#v=onepage&q=categorical%20variables%20labeledpoint%20pyspark&f=false
# # Page 166
# def get_mapping(rdd, idx):
#     return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
#
# # cat_len = sum(map(len, mappings))
Example #58
testFinal.count()


testFinal.collect()


# For getting the threshold limit, use the train dataset

(training1, training2) = trainFinal.randomSplit([0.7, 0.3])

training1.collect()


model_1 = RandomForest.trainRegressor(training1, categoricalFeaturesInfo={},
                                    numTrees=3, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)
model_2 = GradientBoostedTrees.trainRegressor(training1,
                                            categoricalFeaturesInfo={}, numIterations=3)
model_3 = DecisionTree.trainRegressor(training1, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)


predictionsRFTrain = model_1.predict(training1.map(lambda x: x.features))
predictionsGBTTrain = model_2.predict(training1.map(lambda x: x.features))
predictionsDTTrain = model_3.predict(training1.map(lambda x: x.features))

predictionsRFTrain.collect()

predictionsGBTTrain.collect()
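
# Hedged sketch: score the three regressors on the held-out training2 split
# and compare RMSE before choosing which model sets the threshold.
import math
for name, m in [("RF", model_1), ("GBT", model_2), ("DT", model_3)]:
    preds = m.predict(training2.map(lambda x: x.features))
    pairs = training2.map(lambda lp: lp.label).zip(preds)
    mse = pairs.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(training2.count())
    print("%s RMSE = %f" % (name, math.sqrt(mse)))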
Example #59
numFeatures = parsed_data.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
labeled_data = parsed_data.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))

unbalance_test = data_ans_0827.map(feature_char_to_num).cache()
l_unbal_te = unbalance_test.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))


# split the data into trainData and testData
(trainData, testData) = labeled_data.randomSplit([0.9, 0.1])

len_list = [len(i) for i in fe]
col_na_l = [i - 1 for i in col_na]  # shift by one because the first element of each vector is sliced out ([1:-2])
col_na_l = [i - 1 for i in col_na_l if i >= 83]  # shift again to drop the 85th column
features_dict = dict(zip(col_na_l, len_list))  # categorical feature dict, e.g. {1: 3, 5: 8} (built but not passed below)
model = RandomForest.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=50, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=5, maxBins=32)


# Evaluate model on test instances and compute test error
predictions = model.predict(l_unbal_te.map(lambda x: x.features))
labelsAndPredictions = l_unbal_te.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(l_unbal_te.count())
print('Test Error = ' + str(testErr))
recall = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1] and vp[1] == 1).count() \
    / float(labelsAndPredictions.filter(lambda vp: vp[0] == 1).count())
print('recall = TP / (TP + FN):', recall)
precision = labelsAndPredictions.filter(lambda vp: vp[0] == vp[1] and vp[0] == 1).count() \
    / float(labelsAndPredictions.filter(lambda vp: vp[1] == 1).count())
print('precision = TP / (TP + FP):', precision)
print('label 1/0 ratio:', labelsAndPredictions.filter(lambda vp: vp[0] == 1).count()
      / float(labelsAndPredictions.filter(lambda vp: vp[0] == 0).count()))
# print('negatives:', labeled_data.filter(lambda p: p.label == 0).count())
# print('positives:', labeled_data.filter(lambda p: p.label == 1).count())
Example #60
    split = Tokenizer(inputCol="text", outputCol="words")
    wordsData = split.transform(train_hive_info)
    my_print('Tokenization done.......')

    # add the TF feature column
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=2**10)
    TF_data = hashingTF.transform(wordsData)
    my_print('TF features built.......')

    # add the IDF feature column
    idf = IDF(inputCol="rawFeatures", outputCol="features").fit(TF_data)
    final_input_data = idf.transform(TF_data)
    my_print('IDF features built.......')

    train_rdd = final_input_data.select("label", "features") \
        .rdd.map(lambda row: LabeledPoint(row.label, row.features.toArray()))

    if model_name == 'LogisticRegression':
        model = LogisticRegressionWithLBFGS.train(train_rdd, numClasses=10)
        model.save(sc, model_path)

    elif model_name == 'NaiveBayes':
        model = NaiveBayes.train(train_rdd)
        model.save(sc, model_path)

    else:
        model = RandomForest.trainClassifier(train_rdd, 10, {}, 10, seed=42)
        model.save(sc, model_path)
    my_print('Model training done.......')