Example #1
def evaluate_dt(train, test, maxDepth, maxBins):
    # Train a regression tree and return the RMSLE on the test set
    model = DecisionTree.trainRegressor(train, {}, impurity='variance',
                                        maxDepth=maxDepth, maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    rmsle = np.sqrt(tp.map(lambda t_p: squared_log_error(t_p[0], t_p[1])).mean())
    return rmsle
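# Hedged usage sketch (not part of the original example): sweep maxDepth with
# evaluate_dt, assuming `train`/`test` are RDDs of LabeledPoint, numpy is
# imported as np, and squared_log_error is defined as in the surrounding source.
# The candidate depths and the fixed 32 bins are assumptions.
for depth in [3, 5, 10, 20]:
    print("maxDepth=%d -> RMSLE=%.4f" % (depth, evaluate_dt(train, test, depth, 32)))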
Example #2
    def trainClassifier(self):
        # get the current time
        current = time()

        # get the tags
        tags    = self.tags
        numeric = self.numeric
        x       = self.x
        y       = self.y

        # get the training data
        training_data = self.training_labeled

        # start training the tree model
        self.tree_model = DecisionTree.trainClassifier(
                            training_data,
                            numClasses=4,
                            categoricalFeaturesInfo={0 : len(tags), 1 : len(numeric), 2 : len(x), 3 : len(y)},
                            impurity="gini",
                            maxDepth=5,
                            maxBins=1000)

        print(self.tree_model)

        # total time
        total = time() - current

        print "Classifier trained in {} seconds.".format(round(total, 3))

        # start evaluating the model
        self.evaluate()
Example #3
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data (with weights [1.0, 0.0] everything lands in trainSet and validationSet stays empty)
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the models
    decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={},
                                         impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
Example #4
def generateDecisionTree():
    if os.path.exists(DT_PATH):
        print("DT_PATH Already available")
        return

    global model
    data = sc.textFile(F_PATH).map(parseLine)

    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    model = DecisionTree.trainClassifier(trainingData, numClasses=classes.__len__(), categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))

    print('Learned classification tree model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save and load model
    model.save(sc, DT_PATH)
    print("Decision Tree model saved!")
Example #5
def decisionTree(trainingRDD, trainingRDDHashed, testRDDHashed, testRDD):
    # Get size of RDD
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Train the Decision Tree Model
    trainedModel = DecisionTree.trainClassifier(
        trainingRDD,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=2,
        maxBins=3)
    # Test the Model on the Training Set
    predictions = trainedModel.predict(trainingRDD.map(lambda x: x.features))
    labelsAndPredictions = trainingRDD.map(
        lambda lp: lp.label).zip(predictions).countByValue()
    # Map to Dictionary for obtaining Results
    resultsValidation = defaultdict(lambda: 0, labelsAndPredictions)
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Get F-Score and Accuracy Value
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    # Test the Model on the Test Set
    predictions = trainedModel.predict(testRDD.map(lambda x: x.features))
    labelsAndPredictions = testRDD.map(
        lambda lp: lp.label).zip(predictions).countByValue()
    # Map to Dictionary for obtaining Results
    resultsTest = defaultdict(lambda: 0, labelsAndPredictions)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print('   Results for Decision Tree')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
Example #6
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
            RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        except ValueError:
            self.fail()
Example #7
def trainModel(trainingData):
    print('\nTraining Decision Tree model started')
    Utils.logTime()

    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    print('\nTraining Decision Tree model finished')
    Utils.logTime()
    return model
Example #8
def RunDecisionTree(tf):
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    model = DecisionTree.trainClassifier(train, numClasses=numCat, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=100)
    predictions = model.predict(train.map(lambda x: x.features))
    labelsAndPredictions = train.map(lambda lp: lp.label).zip(predictions)
    # Divide by the training-set size (the original divided by test.count(), which is wrong)
    trainErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(train.count())
    print('Training Error = ' + str(trainErr))
Example #9
    def train(self, num_classes=2, categorical_features=None, max_depth=5):
        categorical_features = categorical_features or {}
        model = DecisionTree.trainClassifier(
            self._labeled_feature_vector_rdd(),
            numClasses=num_classes,
            categoricalFeaturesInfo=categorical_features,
            maxDepth=max_depth)
        return DecisionTreeModel(model, self.feature_cols)
Example #10
def RunDecisionTree(tf):
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    numCat = len(genCats)
    model = DecisionTree.trainClassifier(train, numClasses=numCat, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=100)
    # Evaluate model on the held-out test instances and compute the test error
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
    print('Accuracy of decision tree = ' + str(1 - testErr))
    print('Test Error = ' + str(testErr))
Example #11
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #12
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Example #13
def regression(sc, sample):

    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x:LabeledPoint(x[1],x[0]))
    testdata = [8.2]
    #####
#    linear_model = LinearRegressionWithSGD.train(traindata, iterations=10)
#    prediction = linear_model.predict(testdata)
#    print(prediction)


    #####
    decision_model = DecisionTree.trainRegressor(traindata, {})
    prediction = decision_model.predict(testdata)
    print(prediction)
Example #14
def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins):
	
	decisionTreeModel = DecisionTree.trainClassifier(trainingSet, numClasses = 4,categoricalFeaturesInfo={},
														impurity=imp,maxDepth=dtMaxDepth, maxBins=dtMaxBins)


	predictions = decisionTreeModel.predict(trainingSet.map(lambda item: item.features))
	trainingLabelsAndPredictions = trainingSet.map(lambda item: item.label).zip(predictions)
	eva.calculateErrorRate("\nClassification model Training set", trainingLabelsAndPredictions)

	predictions = decisionTreeModel.predict(testSet.map(lambda item: item.features))
	testLabelsAndPredictions = testSet.map(lambda item: item.label).zip(predictions)
	eva.calculateErrorRate("\nClassification model Test set", testLabelsAndPredictions)

	return decisionTreeModel
Example #15
def classify(sc, sample):
    def ff(x):
        newsample = []
        nl = ["rainy","sad","lack"]
        ml = ["cloudy","soso","enough"]
        pl = ["sunny","happy","most"]
        for i in x:
            if i in nl:
                newsample.append(0)
            elif i in ml:
                newsample.append(1)
            elif i in pl:
                newsample.append(2)
        return newsample

    f = lambda x:1 if x=="yes" else 0
    traindata = sc.parallelize(sample).map(lambda x:(ff(x[0]),f(x[1]))) 
    traindata = traindata.map(lambda x:LabeledPoint(x[1],x[0]))
    testdata = traindata.first()
    print(testdata)

    ######
#    print "logistic"
#    lrModel = LogisticRegressionWithSGD.train(traindata, 10)
#    prediction = lrModel.predict(testdata.features)
#    print prediction
    

    #####
#    print "svm"
#    svmModel = SVMWithSGD.train(traindata, 10)
#    prediction = svmModel.predict(testdata.features)
#    print prediction
#
#
#    ####
#    print "naive bayes"
#    nbModel = NaiveBayes.train(traindata)
#    prediction = nbModel.predict(testdata.features)
#    print prediction
#
#
#    ####
    print "decesion tree"
    detreeModel = DecisionTree.trainClassifier(traindata, 2, {})
    prediction = detreeModel.predict(testdata.features)
    print prediction
Example #16
def main(input_file):

    sc = pyspark.SparkContext(appName="DecisionTree")

    data = MLUtils.loadLabeledPoints(sc, input_file)

    trainingData, testData = data.randomSplit([0.70, 0.3])
    # Cache in memory for faster training
    trainingData.cache()

    model = DecisionTree.trainClassifier(trainingData, numClasses=4, impurity='gini',
                 categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    # print(model.toDebugString())
    print("")
    print("")
    print("Test Error: {}".format(round(testErr, 4)))
Example #17
    def trainModel(self, vectSpace, path):
        try:
            if self.type == 'NaiveBayes':
                model = NaiveBayes.train(vectSpace)
            elif self.type == 'DecisionTree':
                model = DecisionTree.trainClassifier(vectSpace, numClasses=len(self.category), categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=5)

            if not os.path.exists(path):
                os.makedirs(path)
            else:
                shutil.rmtree(path)
                os.makedirs(path)

            model.save(self.sc, path)

        except:
            print("Unexpected error:", sys.exc_info()[0])
            raise
        return model
Example #18
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile, markedClusterFile):
	filteredEigenVec = sc.textFile(eigenVecFile).map(lambda item: removeVirtualPart(item)).collect()
	clusterIDs = sc.textFile(markedClusterFile).map(lambda item: extractClusterID(item)).collect()
	clusterIdEigenVecMapRDD = sc.parallelize(clusterIDs).zip(sc.parallelize(filteredEigenVec))
	labeledClusterIdEigenVecMapRdd = clusterIdEigenVecMapRDD.map(lambda item: LabeledPoint(item[0], item[1]))

	trainingSet, testSet = labeledClusterIdEigenVecMapRdd.randomSplit([0.7, 0.3])

	decisionTreeModel = DecisionTree.trainClassifier(trainingSet, numClasses = dtClusterNum,
														categoricalFeaturesInfo={},impurity='entropy',maxDepth=dtMaxDepth, maxBins=dtMaxBins)

	predictions = decisionTreeModel.predict(trainingSet.map(lambda item: item.features))
	trainingLabelsAndPredictions = trainingSet.map(lambda item: item.label).zip(predictions)
	eva.calculateErrorRate("\nCluster model Training set", trainingLabelsAndPredictions)

	predictions = decisionTreeModel.predict(testSet.map(lambda item: item.features))
	testLabelsAndPredictions = testSet.map(lambda item: item.label).zip(predictions)
	eva.calculateErrorRate("\nCluster model Test set", testLabelsAndPredictions)

	return decisionTreeModel
Example #19
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                    numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)

    return model
Example #20
def trainOptimalModel(trainingData, testData):
    print("\nTraining optimal Decision Tree model started!")
    Utils.logTime()

    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    try:
        for curImpurity in impurityVals:
            for curMaxDepth in maxDepthVals:
                for curMaxBins in maxBinsVals:
                    model = DecisionTree.trainClassifier(trainingData,
                                                         numClasses=2,
                                                         categoricalFeaturesInfo={},
                                                         impurity=curImpurity,
                                                         maxDepth=curMaxDepth,
                                                         maxBins=curMaxBins)
                    testErr, PR, ROC = Evaluation.evaluate(model, testData)
                    # Compare against None explicitly; "testErr < minError" fails on the first pass in Python 3
                    if minError is None or testErr < minError:
                        minError = testErr
                        optimalImpurity = curImpurity
                        optimalMaxDepth = curMaxDepth
                        optimalBinsVal = curMaxBins
                        optimalModel = model
    except Exception:
        msg = "\nException during model training with below parameters:"
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
Example #21
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example #22
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Example #23
sc = SparkContext()

result = {1.0: 'yes', 0.0: 'no'}

# Fish-classification dataset from Chapter 3 of "Machine Learning in Action"
data = [
    LabeledPoint(1, [1, 1]),
    LabeledPoint(1, [1, 1]),
    LabeledPoint(0, [1, 0]),
    LabeledPoint(0, [0, 1]),
    LabeledPoint(0, [0, 1])
]

rdd = sc.parallelize(data)

print('------------------------------------')
print(type(rdd), dir(rdd))
print(rdd.collect())
print('------------------------------------')
model = DecisionTree.trainClassifier(rdd, 3, {})

# print(model)

print('********************************************************')
print(model.toDebugString())
print("test [1,0]: %s" % (result[model.predict(array([1, 0]))]))
print("test [1,1]: %s" % (result[model.predict(array([1, 1]))]))
print("test [0,0]: %s" % (result[model.predict(array([0, 0]))]))
print('********************************************************')
sc.stop()
Example #24
conf = SparkConf().setAppName(appName).setMaster("local[2]") #at least 2
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)

# separate the classification label and the actual data
def parsePoint(line):
  values = [float(x) for x in line.split(',')]
  return LabeledPoint(values[0], values[1:])

# training the model
data = sc.textFile(learning_data_file)
parsedData = data.map(parsePoint)

model = (DecisionTree.trainClassifier(parsedData,
                                      numClasses=2,
                                      categoricalFeaturesInfo={2:9},
                                      impurity='gini',
                                      maxDepth=30))
"""
model = (RandomForest.trainClassifier(parsedData,
                                      numClassesForClassification=2,
                                      numTrees=6,
                                      categoricalFeaturesInfo={2:10},
                                      impurity='gini',
                                      maxDepth=30))
"""
print "====================== model trained ======================"

# streaming and parsing text
lines = ssc.socketTextStream(HOST, QUERY_PORT)
vectors = lines.flatMap(lambda x:x.split(',')).map(lambda l:float(l))
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr)) ## 0.555395278766

Example #25
############################ Decision TREE ##############################

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[0:9])

data = sc.textFile("/Users/mac/Desktop/USF/MSAnalytics/Spring1/ML2/ML Project/plays.csv")
header = data.first()
data = data.filter(lambda x: x != header)
parsedData = data.map(parsePoint)

model = DecisionTree.trainClassifier(parsedData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=30, maxBins=100)

# Evaluate model on training instances and compute training error
predictions = model.predict(parsedData.map(lambda x: x.features))
labelsAndPredictions = parsedData.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())  # 0.09
print('Training Error = ' + str(trainErr))
print('Learned classification tree model:')
print(model)
Example #26
        row = []
        while column_number < 200:
            if array[m + column_number] == 1:
                row.append(x[column_number])
                index += 1
            column_number += 1

        return LabeledPoint(train_labels_array[j], row)

    trainingData = train_data.map(f)
    test = test_data.map(g)

    # Train model using DECISION TREE
    model = DecisionTree.trainClassifier(trainingData,
                                         numClasses=12613,
                                         categoricalFeaturesInfo={},
                                         impurity='gini',
                                         maxDepth=2,
                                         maxBins=32)

    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)

    # Obtain the accuracy
    test_accuracy = labelsAndPredictions.filter(
        lambda lp: lp[0] == lp[1]).count() / float(test.count())
    accuracies.append(test_accuracy)
    m += 200

# LAST individual
individual = individual4
z = 0
Example #27
# Split each line into a list based on the comma delimiters
csvData = rawData.map(lambda x: x.split(","))

# Convert these lists to LabeledPoints
trainingData = csvData.map(createLabeledPoints)

# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [ array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Train our DecisionTree classifier using our data set
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
    print(result)

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())
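# Hedged sketch of the train/test split the comment above suggests; the split
# fractions, seed, and variable names are assumptions, not original code.
trainPoints, testPoints = trainingData.randomSplit([0.8, 0.2], seed=42)
evalModel = DecisionTree.trainClassifier(trainPoints, numClasses=2,
                                         categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
                                         impurity='gini', maxDepth=5, maxBins=32)
predsAndLabels = testPoints.map(lambda lp: lp.label).zip(
    evalModel.predict(testPoints.map(lambda lp: lp.features)))
accuracy = predsAndLabels.filter(lambda vp: vp[0] == vp[1]).count() / float(testPoints.count())
print('Held-out accuracy: %.3f' % accuracy)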
Example #28
    def test_all(self, measure_columns=None, dimension_columns=None):
        measures = measure_columns
        if measure_columns is None:
            measures = self._measure_columns
        dimension = dimension_columns[0]
        all_dimensions = self._dimension_columns
        all_measures = self._measure_columns
        cat_feature_info = []
        columns_without_dimension = list(x for x in all_dimensions
                                         if x != dimension)
        mapping_dict = {}
        masterMappingDict = {}
        decision_tree_result = DecisionTreeResult()
        for column in all_dimensions:
            mapping_dict[column] = dict(
                enumerate(
                    self._data_frame.select(column).distinct().rdd.map(
                        lambda x: str(x[0])).collect()))
        # for c in mapping_dict:
        #     name = c
        #     reverseMap = {v: k for k, v in mapping_dict[c].iteritems()}
        #     udf = UserDefinedFunction(lambda x: reverseMap[x], StringType())
        #     self._data_frame = self._data_frame.select(*[udf(column).alias(name) if column == name else column for column in self._data_frame.columns])

        # converting spark dataframe to pandas for transformation and then back to spark dataframe
        pandasDataFrame = self._data_frame.toPandas()
        for key in mapping_dict:
            pandasDataFrame[key] = pandasDataFrame[key].apply(
                lambda x: 'None' if x is None else x)
            reverseMap = {v: k for k, v in mapping_dict[key].items()}
            pandasDataFrame[key] = pandasDataFrame[key].apply(
                lambda x: reverseMap[x])
        # sqlCtx = SQLContext(self._spark)
        self._data_frame = self._spark.createDataFrame(pandasDataFrame)
        self._mapping_dict = mapping_dict
        for c in columns_without_dimension:
            cat_feature_info.append(
                self._data_frame.select(c).distinct().count())
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
        cat_feature_info = dict(enumerate(cat_feature_info))
        dimension_classes = self._data_frame.select(
            dimension).distinct().count()
        self._data_frame = self._data_frame[[dimension] +
                                            columns_without_dimension +
                                            all_measures]
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
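        # Hedged guard for the TO-DO above (an assumption, not original code):
        # make sure maxBins also covers the target dimension's level count.
        max_length = max(max_length, dimension_classes)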
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=3,
            maxBins=max_length)
        output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame)
        self.generate_probabilities(decision_tree, dimension)
        # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
        # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        decision_tree_result.set_params(decision_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability)

        return decision_tree_result
Example #29
print "Decision Tree feature vector length: " + str(
    len(first_point_tree.features))

# In[167]:

from pyspark.mllib.tree import DecisionTree

#from the RDD sample 20% for training and rest for test
records_tree_with_idx = data_tree.zipWithIndex().map(lambda (k, v): (v, k))
test_tree_idx = records_tree_with_idx.sample(False, 0.2, 42)
training_tree_idx = records_tree_with_idx.subtractByKey(test_tree_idx)

test_tree = test_tree_idx.map(lambda (idx, p): p)
training_tree = training_tree_idx.map(lambda (idx, p): p)

model_tree = DecisionTree.trainRegressor(training_tree, {})

preds_tree = model_tree.predict(test_tree.map(lambda p: p.features))
actual_tree = test_tree.map(lambda p: p.label)
true_vs_predicted_tree = actual_tree.zip(preds_tree)

print "Decision Tree predictions: " + str(true_vs_predicted_tree.take(5))
print "Decision Tree depth: " + str(model_tree.depth())
print "Decision Tree number of nodes: " + str(model_tree.numNodes())

# In[177]:

mse_tree = true_vs_predicted_tree.map(lambda
                                      (t, p): squared_error(t, p)).mean()
mae_tree = true_vs_predicted_tree.map(lambda (t, p): abs_error(t, p)).mean()
Example #30
    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'carbon2.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData,
                                         numClasses=5,
                                         categoricalFeaturesInfo={
                                             0: 5,
                                             1: 5
                                         },
                                         impurity='entropy',
                                         maxDepth=5,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
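    # Hedged sketch of the save/load step the comment introduces; the path
    # mirrors the stock Spark example and is an assumption.
    from pyspark.mllib.tree import DecisionTreeModel
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")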
Example #31
    def test_all(self, measure_columns=None, dimension_columns=None):
        measures = measure_columns
        if measure_columns is None:
            measures = self._measure_columns
        self._target_dimension = dimension_columns[0]
        dimension = self._target_dimension

        #####Look into it for Issue 947#################
        max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
        # max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
        # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
        all_dimensions = [
            dim for dim in self._dimension_columns
            if self._metaParser.get_num_unique_values(dim) <= max_num_levels
        ]
        all_measures = self._measure_columns
        if self._pandas_flag:
            self._data_frame = self._data_frame[all_dimensions + all_measures]
        cat_feature_info = []
        columns_without_dimension = [
            x for x in all_dimensions if x != dimension
        ]
        mapping_dict = {}
        masterMappingDict = {}
        decision_tree_result = DecisionTreeResult()
        decision_tree_result.set_freq_distribution(
            self._metaParser.get_unique_level_dict(self._target_dimension),
            self._important_vars)
        if self._pandas_flag:
            try:
                all_dimensions.remove(dimension)
            except:
                pass
            actual_cols = list(self._data_frame.columns)
            print(actual_cols)
            self._data_frame = pd.get_dummies(self._data_frame,
                                              columns=all_dimensions)
            after_dummy_cols = list(self._data_frame.columns)

            def Diff(li1, li2):
                return (list(
                    list(set(li1) - set(li2)) + list(set(li2) - set(li1))))

            decision_tree_result.dummy_cols = [
                Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
                all_dimensions
            ]

        all_dimensions.append(dimension)  #this has been done for scoring error
        if self._pandas_flag:
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, [dimension], self._pandas_flag)
        else:
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, all_dimensions, self._pandas_flag)
        if self._pandas_flag:
            print(self._data_frame.head(1))
        else:
            print(self._data_frame.show(1))
        # standard_measure_index = {0.0:'Low',1.0:'Medium',2.0:'High'}
        standard_measure_index = {
            0.0: 'Low',
            1.0: 'Below Average',
            2.0: 'Average',
            3.0: 'Above Average',
            4.0: 'High'
        }
        for measure in all_measures:
            mapping_dict[measure] = standard_measure_index

        for k, v in list(mapping_dict.items()):
            temp = {}
            for k1, v1 in list(v.items()):
                self._alias_dict[v1.replace(",", "")] = v1
                temp[k1] = v1.replace(",", "")
            mapping_dict[k] = temp
        self._mapping_dict = mapping_dict
        if not self._pandas_flag:

            for c in columns_without_dimension:
                if self._pandas_flag:
                    cat_feature_info.append(len(self._data_frame[c].unique()))
                else:
                    cat_feature_info.append(
                        self._data_frame.select(c).distinct().count())
            for c in all_measures:
                cat_feature_info.append(5)
            columns_without_dimension = columns_without_dimension + all_measures
            all_measures = []
            if len(cat_feature_info) > 0:
                max_length = max(cat_feature_info)
            else:
                max_length = 32
        else:
            decision_tree_result.mappingdict = mapping_dict[dimension]
            max_length = 32
        cat_feature_info = dict(enumerate(cat_feature_info))
        if self._pandas_flag:
            dimension_classes = len(self._data_frame[dimension].unique())
        else:
            dimension_classes = self._data_frame.select(
                dimension).distinct().count()
        if not self._pandas_flag:
            self._data_frame = self._data_frame[[dimension] +
                                                columns_without_dimension +
                                                all_measures]
        print("=" * 200)
        # print self._data_frame.rdd.first()
        print("numClasses", dimension_classes)
        print("maxDepth", self._maxDepth)
        decision_tree_result._maxDepth = self._maxDepth
        print("maxBins", max_length)
        print("=" * 200)
        if self._pandas_flag:
            self._data_frame.columns = [
                re.sub(r'\W+', '_', col.strip())
                for col in self._data_frame.columns
            ]
            x = self._data_frame.drop(dimension, axis=1)
            y = self._data_frame[dimension]
            #tle = LabelEncoder()
            #y = tle.fit_transform(y)
            for i in x.columns:
                x[i] = x[i].fillna(x[i].mode()[0])
            model = DecisionTreeClassifier(criterion='gini',
                                           max_depth=self._maxDepth,
                                           random_state=42)
            model = model.fit(x, y)
            output_result = self.tree_to_code(model, list(x.columns))
            output_result = list(map(lambda x: x.strip(), output_result))
        else:
            data = self._data_frame.rdd.map(
                lambda x: LabeledPoint(x[0], x[1:]))
            (trainingData, testData) = data.randomSplit([1.0, 0.0])
            # TO DO : set maxBins at least equal to the max level of categories in dimension column
            # model = DecisionTree.trainClassifier(trainingData, numClasses=dimension_classes, categoricalFeaturesInfo=cat_feature_info, impurity='gini', maxDepth=self._maxDepth, maxBins=max_length)
            # categoricalFeaturesInfo was once dropped here so every feature was treated as continuous,
            # but that produced wrong prediction rules (e.g. "yes"/"no" columns split at a 0.5 threshold),
            # so categoricalFeaturesInfo=cat_feature_info is passed again
            model = DecisionTree.trainClassifier(
                trainingData,
                numClasses=dimension_classes,
                categoricalFeaturesInfo=cat_feature_info,
                impurity='gini',
                maxDepth=self._maxDepth,
                maxBins=max_length)
            output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame,
                                       self._pandas_flag)
        self._new_tree = self.generate_new_tree(decision_tree)
        node_list = self.node_name_extractor(self._new_tree)
        node_list = list(self.flatten(node_list))
        correct_count_list = [i[0] for i in self._count_list]
        tree_dict = dict(list(zip(node_list, correct_count_list)))
        self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
        self._path_dict = self.path_dict_creator(node_list, self._new_tree)
        print("===" * 40)
        decision_tree_result.set_params(self._new_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability, self._path_dict)
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["script"] *
            self._scriptStages["treegeneration"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
                                    "treegeneration",\
                                    "info",\
                                    self._scriptStages["treegeneration"]["summary"],\
                                    self._completionStatus,\
                                    self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        return decision_tree_result
Example #32
    # Append Labels
    appendColumn(ensemble_test, rf_test_predict_label)
    appendColumn(ensemble_train, rf_train_predict_label)

# Decision Trees
# C13 - C21
# Build the Model
max_depth = [5, 10, 15, 20]

for i in range(0, len(max_depth), 1):
    m_depth = max_depth[i]

    # Build the Model
    model = DecisionTree.trainClassifier(train_data,
                                         10, {},
                                         impurity='gini',
                                         maxDepth=m_depth)

    rf_train_predict_label = []
    rf_test_predict_label = []

    # Predict Labels
    for j in range(0, len(test_features), 1):
        p_l = model.predict(test_features[j])
        rf_test_predict_label.extend([p_l])

    for j in range(0, len(train_features), 1):
        p_l = model.predict(train_features[j])
        rf_train_predict_label.extend([p_l])

    # Append Labels
Example #33
#exec(open("./doweathclass_dectree.py").read())

# ---------------- now try decision tree ------------
from pyspark.mllib.tree import DecisionTree
dt_model = DecisionTree.trainClassifier(datax_rdd,
                                        2, {},
                                        impurity='entropy',
                                        maxDepth=3,
                                        maxBins=32,
                                        minInstancesPerNode=2)

# maxDepth and maxBins control tree size
# {} could be a categorical-features map
# For regression, omit numClasses and use the trainRegressor function
print(dt_model.toDebugString())

#results in this:
#DecisionTreeModel classifier of depth 3 with 9 nodes
#  If (feature 1 <= 0.0)
#   If (feature 4 <= 80.0)
#    If (feature 3 <= 68.0)
#     Predict: 0.0
#    Else (feature 3 > 68.0)
#     Predict: 1.0
#   Else (feature 4 > 80.0)
#    If (feature 0 <= 0.0)
#     Predict: 0.0
#    Else (feature 0 > 0.0)
#     Predict: 0.0
#  Else (feature 1 > 0.0)
#   Predict: 1.0
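# Hedged sketch of the regression variant the comments above mention
# (parameters mirror the classifier call; assumes numeric labels in datax_rdd):
dt_reg_model = DecisionTree.trainRegressor(datax_rdd, {},
                                           impurity='variance',
                                           maxDepth=3,
                                           maxBins=32,
                                           minInstancesPerNode=2)
print(dt_reg_model.toDebugString())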
Example #34
# In[ ]:

# In[53]:

(trainingData, testData) = fdata.randomSplit([0.8, 0.2])

# Use the decision tree classifier to train the model

# In[54]:

from pyspark.mllib.tree import DecisionTree

# In[55]:

model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=3,
                                     categoricalFeaturesInfo={})

# In[56]:

predictions = model.predict(testData.map(lambda row: row.features))

# Create Confusion Matrix to evaluate the accuracy of the model

# We create a matrix containing the test labels as a first column (real values) and predicted values as second column

# In[57]:

predictionsAndLabels = testData.map(
    lambda labeledpoint: labeledpoint.label).zip(predictions)
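# Hedged sketch of the confusion-matrix step described above, using MLlib's
# MulticlassMetrics. It expects (prediction, label) pairs, the reverse of the
# zip order used here, hence the swap.
from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionsAndLabels.map(lambda lp: (lp[1], lp[0])))
print(metrics.confusionMatrix().toArray())
print("Accuracy: %.4f" % metrics.accuracy)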
Example #35
    # Load data.
    dataPath = 'train_svm'# 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)
    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
                                         categoricalFeaturesInfo=categoricalFeaturesInfo)
    # Print learned tree and stats.
    print(origToNewLabels)
    print("Trained DecisionTree for classification:")
    print("  Model numNodes: %d" % model.numNodes())
    print("  Model depth: %d" % model.depth())
    print("  Training accuracy: %g" % getAccuracy(model, reindexedData))
    if model.numNodes() < 20:
        print(model.toDebugString())
    else:
        print(model)
#    testdata = MLUtils.loadLibSVMFile(sc, 'test_svm2')
    data = numpy.genfromtxt('test_svm2', delimiter=',')
# reuben
    rdd = sc.parallelize(data)
    model.predict(rdd).collect()
Example #36
def train_validate_test_rpart():
  try:
    plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
    pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
    
    anom = pat_proc.filter(pat_proc.is_anomalous == 1)
    benign = pat_proc.filter(pat_proc.is_anomalous == 0)
    n_benign = benign.count()
    print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count())) #anom.count() = 49542, benign.count() = 197406
    
    sample_from_benign = benign.sample(False, 50000.0 / n_benign)
    pat_proc = anom.unionAll(sample_from_benign)
    print("pat_proc.count() = " + str(pat_proc.count())) #99,227
    
    all_columns = pat_proc.columns
    features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
    categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway 
    procedure_features = [x for x in features if (x not in categorical_features)]
    
    #Construct the map categoricalFeaturesInfo, which specifies which features are categorical and how many categorical values each of those features can take.
    
    #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
    #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
    cat_feature_number = 0
    dict_cat_features = {}
    
    for feature in categorical_features:
       agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect() #collect() is an action that returns all the elements of the dataset as an array at the driver program. 
       #Calls to collect() imply there would be communication between the executors and the driver, so use it with discretion. 
       distinct_values = [list(row.asDict().values())[0] for row in agvalues]
       distinct_values = sorted(str(val) for val in distinct_values)
       dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
       cat_feature_number += 1
       
    pat_proc = pat_proc.rdd
    print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
    
    (train, test) = pat_proc.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    print("training_data.count() = " + str(training_data.count()))
    
    #Populate the actual categoricalFeaturesInfo dictionary
    cat_features_info = dict([(value[0], value[1]) for (key, value) in dict_cat_features.items()])
    procedure_features_info = dict([(feature_id, 2) for feature_id in range(3, 2 + len(procedure_features))])
    cat_features_info = dict(list(cat_features_info.items()) + list(procedure_features_info.items()))
    
    t0 = time()
    model = DecisionTree.trainClassifier(training_data, numClasses = 2, categoricalFeaturesInfo = cat_features_info, impurity = 'gini', maxDepth = 2, maxBins = 32) 
    #Under the hood in DecisionTree.scala, RandomForest is called with numTrees = 1 and featureSubsetStrategy = "all".
    tt = time() - t0
    print "Classifier trained in {} seconds".format(round(tt,3)) #63.355 seconds (5.5 times compared to standalone R). Even when maxDepth was reduced from 5 to 2, time to train was 61.942 seconds.
    print(model)
    
    test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    
    t0 = time()
    predictions = model.predict(test_data.map(lambda p: p.features))
    tt = time() - t0
    print "Prediction made in {} seconds".format(round(tt,3)) #0.014 seconds
    
    labels_and_preds = test_data.map(lambda p: p.label).zip(predictions) #Create a list of tuples with each tuple having the actual and the predicted label
    test_accuracy = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count() / float(test_data_size)
    fpr = labels_and_preds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count() / float(labels_and_preds.filter(lambda vp: vp[0] == 0).count())
    fnr = labels_and_preds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count() / float(labels_and_preds.filter(lambda vp: vp[0] == 1).count())
    print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))) #With maxDepth = 5, test accuracy is 0.9084, fpr is 0.1555, fnr is 0.0272.
    #With maxDepth = 2, test accuracy is 0.861, fpr is 0.2591, fnr is 0.018
    print(model.toDebugString())
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return model 
Example #37
    #    print('\n== ACCURACY BAYES : ', accuracy_bayes , '==')
    #
    #    file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
    #    file.write('\n-> ACCURACY BAYES : ' + str(accuracy_bayes) + '\n')
    #

    print("\n===================================================== ")
    print("=================== DECISION TREE =================== ")
    print("===================== (Entropy) ===================== ")
    print("=====================================================\n")

    print("\n=================== Training ================== \n")

    model_decision_tree_entropy = DecisionTree.trainClassifier(
        training,
        categoricalFeaturesInfo={},
        impurity="entropy",
        maxDepth=5,
        numClasses=2)
    print("Done : DT entropy training")

    print("\n========= Test on Brexit labeled data ========= ")

    #decision tree entropy
    labeled_prediction_entropy = test_tlabels_brexit.zip(
        model_decision_tree_entropy.predict(tfidf_test_brexit)).map(
            lambda x: {
                "actual": x[0],
                "predicted": x[1]
            })
    accuracy_entropy = 1.0 * labeled_prediction_entropy.filter(
        lambda doc: doc["actual"] == doc['predicted']).count(
Example #38
    attack = 0.0
    if len(line_split) >= 9 and line_split[9] == 'title':
        attack = 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))


training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

# Build the model
t0 = time()
tree_model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={0: len(protocols)},
    impurity='gini',
    maxDepth=4,
    maxBins=100)
tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt, 3))

predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(
    test_data.count())
tt = time() - t0
Пример #39
0
# get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1, 42)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p): p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# make decision tree model
dt_model = DecisionTree.trainRegressor(train_data, {})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())


def squared_error(actual, pred):
    return (pred - actual)**2


def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2
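A short usage sketch for the two error functions (assumes numpy is imported as np, as in the other snippets here):

mse = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
rmsle = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(p, t)).mean())
print "Decision Tree - MSE: %2.4f, RMSLE: %2.4f" % (mse, rmsle)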
Пример #40
0
help(LinearRegressionWithSGD.train)
help(DecisionTree.trainRegressor)

# ## Train a Regression Model on the Bike Sharing Dataset

linear_model = LinearRegressionWithSGD.train(data,
                                             iterations=10,
                                             step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(lambda p:
                             (p.label, linear_model.predict(p.features)))
print("Linear Model predictions: " + str(true_vs_predicted.take(5)))

# we pass in an empty mapping {} for categorical features
dt_model = DecisionTree.trainRegressor(data_dt, {})  # an error occurs here...
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))

# Now, compare the performance of linear regression and the decision tree via squared residuals.

# ## Performance Metrics

# set up performance metrics functions


def squared_error(actual, pred):
    return (pred - actual)**2
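A hedged comparison sketch (not in the original notebook): aggregating squared_error over both prediction RDDs puts the two models side by side.

mse_lr = true_vs_predicted.map(lambda (t, p): squared_error(t, p)).mean()
mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
print("Linear Model MSE: %2.4f, Decision Tree MSE: %2.4f" % (mse_lr, mse_dt))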
Пример #41
0
	if fields[6] == "Y":
		hired = 1
	else:
		hired = 0

	return LabeledPoint(hired, [years_of_exp,employed,previousEmployers,education_level,top_tier_school,internship])



path = '/home/sejal/Documents/datascience/dataset/data/emp/candidates_hired_past.csv'

r1 = sc.textFile(path)
r2 = r1.map(lambda entry: entry.split(','))

training_data = r2.map(prepare_data_for_DT)

test_data = [10,1,2,2,1,0]

model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2})

predictions = model.predict(test_data)

print("Hire OR No-Hire")
print (predictions)

print (model.toDebugString())
# results = predictions.collect()

# for result in results:
# 	print result
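The head of prepare_data_for_DT is cut off above; a hypothetical reconstruction (field positions and encodings inferred from the surviving tail and from categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2}) might look like:

def prepare_data_for_DT(fields):
	# hypothetical reconstruction -- only the tail is from the original
	years_of_exp = int(fields[0])
	employed = 1 if fields[1] == "Y" else 0
	previousEmployers = int(fields[2])
	degree_map = {"BS": 0, "MS": 1, "PhD": 2}  # assumed 4-level encoding (3 = other)
	education_level = degree_map.get(fields[3], 3)
	top_tier_school = 1 if fields[4] == "Y" else 0
	internship = 1 if fields[5] == "Y" else 0
	if fields[6] == "Y":
		hired = 1
	else:
		hired = 0
	return LabeledPoint(hired, [years_of_exp, employed, previousEmployers, education_level, top_tier_school, internship])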
Пример #42
0
pd.DataFrame(dfd.take(5), columns=dfd.columns).transpose()


def labelData(data):
    return data.map(lambda row: LabeledPoint(row[9], [
        row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8],
        row[10], row[11], row[12], row[13], row[14], row[15]
    ]))


trainData, testData = labelData(dfd).randomSplit([0.8, 0.2])

model = DecisionTree.trainClassifier(trainData,
                                     numClasses=4,
                                     maxDepth=6,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxBins=50)

print model.toDebugString()


def getPredictionLabels(model, testData):
    predictions = model.predict(testData.map(lambda r: r.features))
    return predictions.zip(testData.map(lambda r: r.label))


def printMetrics(pred_and_label):
    metrics = MulticlassMetrics(pred_and_label)
    print 'Precision of 1', metrics.precision(1)
    print 'Precision of 2', metrics.precision(2)
Пример #43
0
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))


training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

t0 = time()
tree_model = DecisionTree.trainClassifier(training_data,
                                          numClasses=2,
                                          categoricalFeaturesInfo={
                                              1: len(protocols),
                                              2: len(services),
                                              3: len(flags)
                                          },
                                          maxDepth=4,
                                          maxBins=100)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

t0 = time()
predictions = tree_model.predict(test_data.map(lambda x: x.features))
labels_and_preds = test_data.map(lambda x: x.label).zip(predictions)
test_accuracy = labels_and_preds.filter(
    lambda x: x[0] == x[1]).count() / float(test_data.count())
tt = time() - t0
print("Prediction made in {} seconds. Test accuracy is {}".format(
    round(tt, 3), round(test_accuracy, 3)))
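The protocols, services and flags lookups are built elsewhere in the original script; a minimal sketch of the usual construction (the raw_data name and KDD column positions are assumptions) is:

protocols = raw_data.map(lambda x: x.split(",")[1]).distinct().collect()
services = raw_data.map(lambda x: x.split(",")[2]).distinct().collect()
flags = raw_data.map(lambda x: x.split(",")[3]).distinct().collect()
# create_labeled_point then encodes each raw string as its index, e.g. protocols.index(value)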
Пример #44
0
print 'Decision tree feature vector length: ' + str(len(first_point_dt.features))

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

help(LinearRegressionWithSGD.train)

linear_model = LinearRegressionWithSGD.train(data,
                                             iterations=10,
                                             step=0.1,
                                             intercept=False)
true_vs_predicted = data.map(
    lambda point: (point.label, linear_model.predict(point.features)))
print 'Linear regression predictions for the first 5 samples: ' + str(true_vs_predicted.take(5))

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print 'Decision tree regression predictions for the first 5 samples: ' + str(true_vs_predicted_dt.take(5))
print 'Decision tree depth: ' + str(dt_model.depth())
print 'Decision tree node count: ' + str(dt_model.numNodes())


def squared_error(actual, pred):
    return (pred - actual)**2


def abs_error(actual, pred):
    return np.abs(pred - actual)
Пример #45
0
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
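The helpers test(), fone() and accuracy() are referenced above but not shown; a hedged sketch of plausible implementations (resultrdd assumed to hold (prediction, label) float pairs):

def test(model, testDatardd):
    # predict on the whole RDD at once; calling model.predict per record inside a map is not supported for tree models in PySpark
    preds = model.predict(testDatardd.map(lambda _: _['feature']))
    labels = testDatardd.map(lambda _: float(_['label']))
    return preds.map(float).zip(labels)

def accuracy(resultrdd):
    return resultrdd.filter(lambda (p, l): p == l).count() / float(resultrdd.count())

def fone(resultrdd):
    tp = resultrdd.filter(lambda (p, l): p == 1.0 and l == 1.0).count()
    fp = resultrdd.filter(lambda (p, l): p == 1.0 and l == 0.0).count()
    fn = resultrdd.filter(lambda (p, l): p == 0.0 and l == 1.0).count()
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0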
Пример #46
0
    LR_model = LogisticRegressionWithLBFGS.train(trained_hashed)
    LR_prediction_and_labels = check_hashed.map(lambda point: (LR_model.predict(point.features), point.label))
    LR_correct = LR_prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)  # filter receives a single (prediction, label) tuple
    LR_accuracy = LR_correct.count() / float(check_hashed.count())
    print ("LR training accuracy:" + str(LR_accuracy * 100) + " %")
    LR_output_dir = 'hdfs://master:9000/user/hadoop/LogisticRegression'
    shutil.rmtree("hdfs://master:9000/user/hadoop/LogisticRegression/metadata", ignore_errors=True)
    LR_model.save(cc, LR_output_dir)

    SVM_model = SVMWithSGD.train(trained_hashed, iterations=10)
    SVM_prediction_and_labels = check_hashed.map(lambda point: (SVM_model.predict(point.features), point.label))
    SVM_correct = SVM_prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
    SVM_accuracy = SVM_correct.count() / float(check_hashed.count())
    print ("SVM training accuracy:" + str(SVM_accuracy * 100) + " %")
    SVM_model.clearThreshold()  # clear the threshold only after accuracy is computed, so predict() above returns class labels rather than raw margins
    SVM_output = 'hdfs://master:9000/user/hadoop/SVM'
    shutil.rmtree("hdfs://master:9000/user/hadoop/SVM/metadata", ignore_errors=True)
    SVM_model.save(cc, SVM_output)

    model = DecisionTree.trainClassifier(trained_hashed, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    predictions = model.predict(check_hashed.map(lambda x: x.features))
    labelsAndPredictions = check_hashed.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(check_hashed.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    model.save(cc, "hdfs:///user/hadoop/DT")

Пример #47
0
# get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p) : p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# make decision tree model 
dt_model = DecisionTree.trainRegressor(train_data,{})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

def squared_error(actual, pred): 
	return (pred - actual)**2

def squared_log_error(pred, actual):
	return (np.log(pred + 1) - np.log(actual + 1))**2
Пример #48
0
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest, \
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Пример #49
0
	
	#ArrDelay is our response
	#ArrDelay becomes the 8th column now, and total columns in the data = 12
	label = clean_line_split[0]
	nonLable = clean_line_split[1:]
	return LabeledPoint (label, nonLable)

parsedData = raw_data.map (parsePoint)
#divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

#start timer at this point
startTime = datetime.now()
#build the model
#empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor (training, categoricalFeaturesInfo={},
                                         impurity='variance', maxDepth=5, maxBins=32)

#evaluate model on test instances and compute test error
predictions = model.predict (test.map (lambda x: x.features))
labelsAndPredictions = test.map (lambda lp: lp.label).zip (predictions)
testMSE = labelsAndPredictions.map (lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())  #the RDD produced by randomSplit above is named 'test', not 'testData'

print ('Time consumed = '), (datetime.now() - startTime)

print ('Test Mean Squared Error = ' + str (testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

#save and load model
model.save (sc, "DTR-Narrow-2008")
Пример #50
0
    def test_regression(self):
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd,
                                          initialWeights=array([1.0, 1.0]),
                                          iterations=10)
            LassoWithSGD.train(rdd,
                               initialWeights=array([1.0, 1.0]),
                               iterations=10)
            RidgeRegressionWithSGD.train(rdd,
                                         initialWeights=array([1.0, 1.0]),
                                         iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4,
            maxBins=32)
        with self.assertRaises(Exception) as cm:
            GradientBoostedTrees.trainRegressor(
                rdd,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numIterations=4,
                maxBins=1)
Пример #51
0
    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Пример #52
0
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

# <codecell>

data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt').cache()

# <codecell>

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(data,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=100)

# <codecell>

# Evaluate model on training instances and compute training error
predictions = model.predict(data.map(lambda x: x.features))
labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
trainMSE = labelsAndPredictions.map(lambda (v, p): (v - p) *
                                    (v - p)).sum() / float(data.count())
print('Training Mean Squared Error = ' + str(trainMSE))
print('Learned regression tree model:')
print(model)

# <codecell>
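A hedged variation (not in the original notebook): evaluating on a held-out split instead of the training data gives a less optimistic error estimate.

(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = DecisionTree.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=100)
predictions = model.predict(testData.map(lambda x: x.features))
testMSE = testData.map(lambda lp: lp.label).zip(predictions) \
    .map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))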
Пример #53
0
        vector)  # if a feature from Feature Selection is needed

    data = pass2libsvm(reduced, sc.parallelize(classes))

    # for the 5-tuple, it should be something like data = pass2libsvm(vector)

    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    print 'data divided'

    #trainingData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-16.out',15))

    #testData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-25.out',15))

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numberClasses, {})
    #, maxDepth=5, maxBins=32)

    # let lrm be a LogisticRegression Model

    #model.save(sc, "hdfs://master:9000/user/app/model-"+str(sys.argv[2]+".model"))
    print 'model done'
    #to load the model
    #sameModel = DecisionTreeModel.load(sc, "lrm_model.model")

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))

    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

    metrics = MulticlassMetrics(labelsAndPredictions)
Пример #54
0
# Split each line into a list based on the comma delimiters
csvData = rawData.map(lambda x: x.split(","))

# Convert these lists to LabeledPoints
trainingData = csvData.map(createLabeledPoints)

# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [ array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Train our DecisionTree classifier using our data set
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print ('Hire prediction:')
results = predictions.collect()
for result in results:
    print result

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())
Пример #55
0
header = train_rawDataWithHeader.first()
rawData = train_rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
train_data = rData.map(lambda x: x.split(","))
#test_data=prepare_data(test)
header = test_rawDataWithHeader.first()
rawData = test_rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
test_data = rData.map(lambda x: x.split(","))

lines = train_data.union(test_data)  # RDDs are combined with union(), not +
categoriesMap = lines.map(lambda fields: fields[0]).distinct().zipWithIndex().collectAsMap()
train_RDD=train_data.map(lambda r:LabeledPoint(extract_label(r),extract_features(r,categoriesMap,len(r)-1)))
test_RDD=test_data.map(lambda r: (extract_features(r,categoriesMap,len(r)-1),r[-1]))

model=DecisionTree.trainClassifier(train_RDD,numClasses=2,categoricalFeaturesInfo={},impurity='entropy',maxDepth=14,maxBins=9)

count = 0
num = 0
positive = 0
negative = 0
truePositive = 0
trueNegative =0
falsePositive = 0
falseNegative = 0
for data in test_RDD.take(test_data.count()):
    num+=1
    preds = int(model.predict(data[0]))
    #print(str(preds)+' '+str(data[1]))
    if(preds == int(data[1])):count=count+1
    if int(data[1]) == 0: negative += 1
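A hedged, distributed alternative to the truncated driver-side tally above (test_RDD holds (features, raw_label) pairs as built earlier):

features = test_RDD.map(lambda x: x[0])
labels = test_RDD.map(lambda x: float(x[1]))
preds = model.predict(features)
accuracy = labels.zip(preds).filter(lambda (l, p): l == p).count() / float(test_RDD.count())
print("accuracy = " + str(accuracy))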
Пример #56
0
def main():
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    #Import a messed up  DataFrame (DF) to get Column info
    raw_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true') \
    .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz")
    features = raw_df.columns
    #Manually specify the correct datatypes for each Column and import new DF
    fields = [
        StructField(field_name, FloatType(), True) for field_name in features
    ]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df = spark.read.format('com.databricks.spark.csv').options(header='true') \
    .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz", schema=customSchema)
    df = df.na.fill(float('nan'))  # na.fill returns a new DataFrame and must be reassigned; Python spells NaN as float('nan')
    #Prepare feature for computation!
    #Remove features from a list precompiled on correlation criterion!
    counter = 0
    with open('column_refine_list.csv', 'r') as f:
        csvlist = csv.reader(f, delimiter=',')
        for item in csvlist:
            column_to_go = item[:]
    print("Total numer of features to be removed: %d" % (len(column_to_go)))
    print("\n")
    for item in column_to_go:
        df = df.drop(item)
    print("Final number of features is: %d" % (len(df.columns[1:-1])))
    #Decision Tree model Training
    training_points = labelData(df)  #csv_numeric.map(ReducedlabelData)
    training_data, test_data = training_points.randomSplit([0.7, 0.3])
    print(training_data.first())
    t0 = time()
    tree_model = DecisionTree.trainClassifier(training_data,
                                              numClasses=2,
                                              categoricalFeaturesInfo={},
                                              impurity='gini',
                                              maxDepth=4,
                                              maxBins=100)
    tt = time() - t0
    tree_model.save(
        sc, "DTmodel_model_reduced_222red")  #Save the model for future use!
    print("Model trained in : %.4f Sec" % (tt))
    print(tree_model.toDebugString())

    #Making predictions on the test set
    ## Predict
    t0 = time()
    labels_and_preds = getLabelsPredictions(tree_model, test_data)
    test_accuracy = 100 * labels_and_preds.filter(
        lambda (v, p): v == p).count() / float(test_data.count())
    print("Test accuracy is    : %.4f " % (test_accuracy))
    printMCC(labels_and_preds)
    tt = time() - t0
    print("Predictions and metrics computed in : %.4f Sec" % (tt))

    features_new = raw_df.columns
    #Manually specify the correct datatypes for each Column and import new DF
    fields = [
        StructField(field_name, FloatType(), True)
        for field_name in features_new
    ]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df.select(
        df.columns
    ).write.format('com.databricks.spark.csv').options(header='true').save(
        '/data/ganesh/BigData/Bosch/Source/feature_reduction/numeric/feature_reduction_V2/df_739.csv',
        schema=customSchema)
    sc.stop()
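labelData() and getLabelsPredictions() are referenced above but not defined in this excerpt; a hedged sketch of the usual shapes (the Bosch column layout -- Id first, Response last -- is an assumption):

from pyspark.mllib.regression import LabeledPoint

def labelData(df):
    # first column = Id, last column = Response label, the rest are features
    return df.rdd.map(lambda row: LabeledPoint(row[-1], row[1:-1]))

def getLabelsPredictions(tree_model, test_data):
    preds = tree_model.predict(test_data.map(lambda p: p.features))
    return test_data.map(lambda p: p.label).zip(preds)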
Пример #57
0
# Inspect the result of the data preprocessing
# print(labelpointRDD.first())

# Use randomSplit to randomly divide the data 3:1 (75%:25%) into a train set and a test set
(trainData, testData) = labelpointRDD.randomSplit([3, 1])
# print(testData.count())

# Cache the train set and test set in memory to speed up execution
trainData.persist()
testData.persist()

# Tune the maxDepth parameter
params = [5, 10, 15, 20]
for i in params:
    # Use the decision tree classifier supported by Spark MLlib
    model = DecisionTree.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={}, impurity="entropy", maxDepth=i, maxBins=15)
    # Use model.predict to make predictions on testData
    score = model.predict(testData.map(lambda p: p.features))
    # Print the prediction results
    # score.foreach(print)
    print(score.collect())  # all
    # print(score.take(2))  # first two
    # print(score.count())  # number of records
    # Combine the predictions with the true labels
    scoreAndLabels = score.zip(testData.map(lambda p: p.label))
    # Use MulticlassMetrics to build a confusion matrix and compute Accuracy, Recall, and Precision
    metrics = MulticlassMetrics(scoreAndLabels)
    print(metrics.confusionMatrix())
    print("Accuracy = %s" % metrics.accuracy)
    print("Recall = %s" % metrics.recall(0))
    print("Precision = %s" % metrics.precision(0))
Пример #58
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        if dimension_columns is None:
            dimensions = self._dimension_columns
        self._target_dimension = measure_columns[0]
        dimension = self._target_dimension
        max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
        max_num_levels = min(max_num_levels,
                             round(self._dataframe_helper.get_num_rows()**0.5))
        # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
        all_dimensions = [
            dim for dim in self._dimension_columns
            if self._metaParser.get_num_unique_values(dim) <= max_num_levels
        ]
        all_measures = [
            x for x in self._measure_columns if x != self._target_dimension
        ]
        self.transform_data_frames()
        decision_tree_result = DecisionTreeResult()
        cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
        cat_feature_info = dict(enumerate(cat_feature_info))
        # print cat_feature_info
        if self._pandas_flag:
            dimension_classes = self._data_frame[dimension].nunique()
            self._data_frame = self._data_frame[[dimension] + all_dimensions +
                                                all_measures]
            x = self._data_frame.drop(dimension, axis=1)
            y = self._data_frame[dimension]
            for i in x.columns:
                x[i] = x[i].fillna(x[i].mode()[0])
            model = DecisionTreeRegressor(max_depth=6)
            model = model.fit(x, y)
            output_result = self.tree_to_code(model, list(x.columns))
            output_result = list(map(lambda x: x.strip(), output_result))
            print(output_result, "output_result")
        else:
            dimension_classes = self._data_frame.select(
                dimension).distinct().count()
            self._data_frame = self._data_frame[[dimension] + all_dimensions +
                                                all_measures]
            data = self._data_frame.rdd.map(
                lambda x: LabeledPoint(x[0], x[1:]))
            (trainingData, testData) = data.randomSplit([1.0, 0.0])
            # TO DO : set maxBins at least equal to the max level of categories in dimension column
            model = DecisionTree.trainClassifier(
                trainingData,
                numClasses=dimension_classes,
                categoricalFeaturesInfo=cat_feature_info,
                impurity='gini',
                maxDepth=6,
                maxBins=max_length)
            output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame,
                                       self._pandas_flag)
        self._new_tree = self.generate_new_tree(decision_tree)
        node_list = self.node_name_extractor(self._new_tree)
        node_list = list(self.flatten(node_list))
        correct_count_list = [i[0] for i in self._count_list]
        tree_dict = dict(list(zip(node_list, correct_count_list)))
        #self._new_tree = self.generate_new_tree_total(decision_tree)
        self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
        self._path_dict = self.path_dict_creator(node_list, self._new_tree)
        # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
        # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        decision_tree_result.set_params(self._new_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability, self._path_dict)
        decision_tree_result.set_target_map(
            self._mapping_dict[self._target_dimension], self._aggr_data,
            self._important_vars)

        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]*self._scriptStages["dtreeTrainingStart"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "dtreeTrainingEnd",\
        #                             "info",\
        #                             self._scriptStages["dtreeTrainingEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "dtreeTrainingEnd",
            "info",
            weightKey="script")

        # print decision_tree_result
        return decision_tree_result
Пример #59
0
from numpy import array
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

data = [
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(1.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0])
]
model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {})
print(model)
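A small usage sketch (an addition, not from the original): with this toy data a single split near 0.5 is expected, so single-vector and RDD predictions behave as below.

print(model.predict([0.0]))  # expected 0.0
print(model.predict(sc.parallelize([[1.5], [2.5]])).collect())  # expected [1.0, 1.0]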
Пример #60
0
# <codecell>

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

# <codecell>

data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt').cache()
data.take(5)

# <codecell>

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=100)

# <codecell>

predictions = model.predict(data.map(lambda x: x.features))
labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.take(10)

# <codecell>

trainErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(data.count())
print('Training Error = ' + str(trainErr))
print('Learned classification tree model:')
print(model)

# <codecell>