Exemplo n.º 1
0
    def trainClassifier(self):
        """Fit the decision-tree classifier on ``self.training_labeled``.

        The fitted model is stored on ``self.tree_model``. The model and the
        elapsed wall-clock training time are printed, and ``self.evaluate()``
        is called at the end.
        """
        started_at = time()

        # Features 0-3 are declared categorical; each arity is the size of
        # the matching collection held on the instance.
        arity_by_feature = {
            0: len(self.tags),
            1: len(self.numeric),
            2: len(self.x),
            3: len(self.y),
        }

        self.tree_model = DecisionTree.trainClassifier(
            self.training_labeled,
            numClasses=4,
            categoricalFeaturesInfo=arity_by_feature,
            impurity="gini",
            maxDepth=5,
            maxBins=1000)
        print(self.tree_model)

        # Report how long training took, rounded to milliseconds.
        elapsed = time() - started_at
        print("Classifier trained in {} seconds.".format(round(elapsed, 3)))

        # Hand over to the evaluation step.
        self.evaluate()
Exemplo n.º 2
0
def main():
    """Train a decision tree on ``train.csv`` and run it against ``test.csv``."""
    spark = SparkContext(appName="MyApp")
    spark.setLogLevel('ERROR')

    # Load both files; the test file is read with use_labels=False.
    labels_train, features_train = load_data('train.csv')
    labels_placeholder, features_test = load_data('test.csv', use_labels=False)

    # Attach each label to its feature row.
    labeled_train = reformatData(features_train, labels_train)
    labeled_test = reformatData(features_test, labels_placeholder)

    # Distribute both collections as RDDs.
    train_rdd = spark.parallelize(labeled_train)
    test_rdd = spark.parallelize(labeled_test)

    # Weights [1.0, 0.0]: the second part of the split gets weight zero.
    fit_rdd, holdout_rdd = train_rdd.randomSplit([1.0, 0.0], seed=42)

    # Fit the tree.
    tree = DecisionTree.trainClassifier(
        fit_rdd,
        numClasses=5,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxBins=55,
        maxDepth=30,
        minInstancesPerNode=2)

    # Score the test RDD with the fitted tree.
    testDecisionTree(tree, test_rdd)
def decisionTree(trainingRDD, trainingRDDHashed, testRDDHashed, testRDD):
    """Train a two-class decision tree and score it on training and test data.

    Args:
        trainingRDD: RDD used both to train the tree and for the training-set
            evaluation; its elements are read via ``.label`` and ``.features``.
        trainingRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy``
            together with the training-set results.
        testRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy``
            together with the test-set results.
        testRDD: RDD used for the test-set evaluation; same element shape as
            ``trainingRDD``.

    Returns:
        Tuple ``(AccuracyV, fScoreV, AccuracyT, fScoreT)`` as produced by the
        two ``getAccuracy`` calls.
    """
    # Each count() is a Spark job, so run each exactly once.
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Train the Decision Tree Model
    trainedModel = DecisionTree.trainClassifier(
        trainingRDD,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=2,
        maxBins=3)

    def _labelPredictionCounts(dataset):
        # Count every (label, prediction) pair; absent pairs read as 0.
        predictions = trainedModel.predict(dataset.map(lambda x: x.features))
        counted = dataset.map(
            lambda lp: lp.label).zip(predictions).countByValue()
        return defaultdict(lambda: 0, counted)

    # Test the Model on the Training Set
    resultsValidation = _labelPredictionCounts(trainingRDD)
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)

    # Test the Model on the Test Set
    resultsTest = _labelPredictionCounts(testRDD)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)

    # Print Results. The formatting happens inside the print call so it works
    # whether print is a statement or a function.
    print('   Results for Decision Tree')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
Exemplo n.º 4
0
def generateDecisionTree():
    """Train, evaluate and save a decision tree unless ``DT_PATH`` already exists.

    Reads the data at ``F_PATH`` through ``parseLine``, trains on a 90% split,
    prints the error on the remaining 10% together with the learned tree,
    calls ``modelStatistics`` and saves the model to ``DT_PATH``. The trained
    model is also published through the module-level ``model`` name.
    """
    global model

    if os.path.exists(DT_PATH):
        print("DT_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)

    # Fixed seed keeps the split reproducible between runs.
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    model = DecisionTree.trainClassifier(trainingData, numClasses=len(classes), categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index into the (label, prediction) pair instead of using a
    # tuple-unpacking lambda parameter, which Python 3 no longer accepts.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = mismatches / float(testData.count())
    # Build one string so the line is not printed as a tuple under Python 2.
    print('Test Error = ' + str(testErr))

    print('Learned classification tree model:')
    print(model.toDebugString())

    modelStatistics(labelsAndPredictions)

    # Save model
    model.save(sc, DT_PATH)
    print("Decision Tree model saved!")
Exemplo n.º 5
0
 def train(self, num_classes=2, categorical_features=None, max_depth=5):
     """Train a decision-tree classifier and wrap it in a ``DecisionTreeModel``.

     A falsy ``categorical_features`` is replaced by an empty dict. The
     training input comes from ``self._labeled_feature_vector_rdd()`` and the
     wrapper also receives ``self.feature_cols``.
     """
     if not categorical_features:
         categorical_features = {}
     fitted_tree = DecisionTree.trainClassifier(
         self._labeled_feature_vector_rdd(),
         maxDepth=max_depth,
         categoricalFeaturesInfo=categorical_features,
         numClasses=num_classes)
     return DecisionTreeModel(fitted_tree, self.feature_cols)
Exemplo n.º 6
0
def trainModel(trainingData):
	"""Train a two-class gini decision tree, logging the time before and after."""
	print('\nTraining Decision Tree model started')
	Utils.logTime()

	tree_settings = dict(
		numClasses=2,
		categoricalFeaturesInfo={},
		impurity='gini',
		maxDepth=5,
		maxBins=32)
	fitted_tree = DecisionTree.trainClassifier(trainingData, **tree_settings)

	print('\nTraining Decision Tree model finished')
	Utils.logTime()
	return fitted_tree
Exemplo n.º 7
0
def RunDecisionTree(tf):
	"""Train a decision tree on an 80/20 split of ``tf`` and print its training error.

	``tf`` is mapped through ``parseAsLabeledPoints``; the number of classes is
	taken from the module-level ``numCat``. Nothing is returned.
	"""
	rdd = tf.map(parseAsLabeledPoints)
	train, test = rdd.randomSplit([.8, .2])
	model = DecisionTree.trainClassifier(train, numClasses=numCat, categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=100)

	# Evaluate the model on the rows it was trained on.
	predictions = model.predict(train.map(lambda x: x.features))
	labelsAndPredictions = train.map(lambda lp: lp.label).zip(predictions)
	mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
	# The mismatches were counted on `train`, so normalise by the size of
	# `train`; dividing by test.count() gave a meaningless ratio.
	trainErr = mismatches / float(train.count())
	print('Training Error = ' + str(trainErr))
Exemplo n.º 8
0
def RunDecisionTree(tf):
	"""Train a decision tree on 80% of ``tf`` and print accuracy/error on the other 20%.

	``tf`` is mapped through ``parseAsLabeledPoints``; the number of classes is
	``len(genCats)``. Nothing is returned.
	"""
	rdd = tf.map(parseAsLabeledPoints)
	train, test = rdd.randomSplit([.8, .2])
	numCat = len(genCats)
	model = DecisionTree.trainClassifier(train, numClasses=numCat, categoricalFeaturesInfo={},
		impurity='gini', maxDepth=5, maxBins=100)

	# Evaluate model on the held-out test instances and compute the test error
	predictions = model.predict(test.map(lambda x: x.features))
	labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
	mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
	testErr = mismatches / float(test.count())

	# One string per line so nothing is printed as a tuple under Python 2.
	# The figure is measured on `test`, so it is reported as the test error.
	print('Accuracy of decision tree = ' + str(1 - testErr))
	print('Test Error = ' + str(testErr))
Exemplo n.º 9
0
    def test_classification(self):
        """Each classifier must separate the four dense toy samples by label."""
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

        # Samples 0 and 2 carry label 0.0; samples 1 and 3 carry label 1.0.
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_predictions(fitted):
            # Same four assertions, in the same order, for every model.
            self.assertTrue(fitted.predict(features[0]) <= 0)
            self.assertTrue(fitted.predict(features[1]) > 0)
            self.assertTrue(fitted.predict(features[2]) <= 0)
            self.assertTrue(fitted.predict(features[3]) > 0)

        check_predictions(LogisticRegressionWithSGD.train(rdd))
        check_predictions(SVMWithSGD.train(rdd))
        check_predictions(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        check_predictions(DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
        check_predictions(RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
        check_predictions(GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins):
	"""Train a four-class decision tree and report its error on both sets.

	The error rate is reported through ``eva.calculateErrorRate``, first for
	``trainingSet`` and then for ``testSet``. Returns the trained model.
	"""
	decisionTreeModel = DecisionTree.trainClassifier(
		trainingSet,
		numClasses=4,
		categoricalFeaturesInfo={},
		impurity=imp,
		maxDepth=dtMaxDepth,
		maxBins=dtMaxBins)

	def report(title, dataset):
		# Pair each true label with the model's prediction for that row.
		predicted = decisionTreeModel.predict(dataset.map(lambda item: item.features))
		labelsAndPredictions = dataset.map(lambda item: item.label).zip(predicted)
		eva.calculateErrorRate(title, labelsAndPredictions)

	report("\nClassification model Training set", trainingSet)
	report("\nClassification model Test set", testSet)

	return decisionTreeModel
Exemplo n.º 11
0
def classify(sc, sample):
    """Encode ``sample`` as LabeledPoints, train a decision tree and print one prediction.

    Each element of ``sample`` is read as ``(words, answer)``: the words are
    encoded as 0/1/2 by group membership and the answer as 1 for "yes", else
    0. The first encoded point is printed, then used as the prediction input.
    """
    def ff(x):
        # A word's code is the index of the group that contains it; words
        # found in no group are dropped.
        groups = (
            ["rainy", "sad", "lack"],
            ["cloudy", "soso", "enough"],
            ["sunny", "happy", "most"],
        )
        encoded = []
        for word in x:
            for code, members in enumerate(groups):
                if word in members:
                    encoded.append(code)
                    break
        return encoded

    def f(x):
        if x == "yes":
            return 1
        return 0

    def to_labeled_point(row):
        feature_codes = ff(row[0])
        label = f(row[1])
        return LabeledPoint(label, feature_codes)

    traindata = sc.parallelize(sample).map(to_labeled_point)
    testdata = traindata.first()
    print(testdata)

    # Disabled experiments with other classifiers, kept for reference:
    #
    #    print "logistic"
    #    lrModel = LogisticRegressionWithSGD.train(traindata, 10)
    #    prediction = lrModel.predict(testdata.features)
    #    print prediction
    #
    #    print "svm"
    #    svmModel = SVMWithSGD.train(traindata, 10)
    #    prediction = svmModel.predict(testdata.features)
    #    print prediction
    #
    #    print "naive bayes"
    #    nbModel = NaiveBayes.train(traindata)
    #    prediction = nbModel.predict(testdata.features)
    #    print prediction

    print("decesion tree")
    detreeModel = DecisionTree.trainClassifier(traindata, 2, {})
    print(detreeModel.predict(testdata.features))
def main(input_file):
    """Train a four-class decision tree on ``input_file`` and print its test error."""
    spark = pyspark.SparkContext(appName="DecisionTree")

    points = MLUtils.loadLabeledPoints(spark, input_file)

    train_part, test_part = points.randomSplit([0.70, 0.3])
    # Keep the training split in memory for faster training.
    train_part.cache()

    tree = DecisionTree.trainClassifier(
        train_part,
        numClasses=4,
        impurity='gini',
        categoricalFeaturesInfo={},
        maxDepth=16,
        maxBins=10)

    # Pair true labels with predictions and measure the mismatch share.
    predicted = tree.predict(test_part.map(lambda x: x.features))
    label_prediction_pairs = test_part.map(lambda lp: lp.label).zip(predicted)
    wrong = label_prediction_pairs.filter(lambda pair: pair[0] != pair[1]).count()
    testErr = wrong / float(test_part.count())

    # Two blank lines, then the rounded error.
    print("")
    print("")
    print("Test Erros: {}".format(round(testErr,4)))
	def trainModel(self, vectSpace, path):
		"""Train the model selected by ``self.type`` and save it under ``path``.

		Args:
			vectSpace: Training data handed to the MLlib trainer.
			path: Directory the model is saved to. An existing directory at
				this path is deleted and recreated first.

		Returns:
			The trained model.

		Raises:
			ValueError: If ``self.type`` is neither ``'NaiveBayes'`` nor
				``'DecisionTree'``.
			Exception: Any error raised while training or saving is printed
				and re-raised unchanged.
		"""
		try:
			if self.type == 'NaiveBayes':
				model = NaiveBayes.train(vectSpace)
			elif self.type == 'DecisionTree':
				model = DecisionTree.trainClassifier(vectSpace, numClasses = len(self.category), categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=5)
			else:
				# Fail with a clear message instead of falling through to an
				# unbound `model` a few lines later.
				raise ValueError("Unsupported model type: %r" % (self.type,))

			# Always start from an empty directory.
			if os.path.exists(path):
				shutil.rmtree(path)
			os.makedirs(path)

			model.save(self.sc, path)

		except:
			# Report the exception type, then let the error propagate.
			print("Unexpected error: %s" % (sys.exc_info()[0],))
			raise
		return model
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile, markedClusterFile):
	"""Train an entropy decision tree that maps eigenvectors to cluster ids.

	Lines of ``eigenVecFile`` pass through ``removeVirtualPart`` and lines of
	``markedClusterFile`` through ``extractClusterID``. The two collected
	lists are zipped into LabeledPoints, split 70/30, and the error on both
	splits is reported through ``eva.calculateErrorRate``. Returns the model.
	"""
	# Both inputs are collected to the driver and re-parallelized before zip.
	vectors = sc.textFile(eigenVecFile).map(lambda item: removeVirtualPart(item)).collect()
	ids = sc.textFile(markedClusterFile).map(lambda item: extractClusterID(item)).collect()
	idVectorPairs = sc.parallelize(ids).zip(sc.parallelize(vectors))
	labeledPoints = idVectorPairs.map(lambda item: LabeledPoint(item[0], item[1]))

	trainingSet, testSet = labeledPoints.randomSplit([0.7, 0.3])

	decisionTreeModel = DecisionTree.trainClassifier(
		trainingSet,
		numClasses=dtClusterNum,
		categoricalFeaturesInfo={},
		impurity='entropy',
		maxDepth=dtMaxDepth,
		maxBins=dtMaxBins)

	def report(title, dataset):
		# Pair each true label with the model's prediction for that row.
		predicted = decisionTreeModel.predict(dataset.map(lambda item: item.features))
		eva.calculateErrorRate(title, dataset.map(lambda item: item.label).zip(predicted))

	report("\nCluster model Training set", trainingSet)
	report("\nCluster model Test set", testSet)

	return decisionTreeModel
Exemplo n.º 15
0
def create_model(name, training):
    """Train and return the model selected by ``name``.

    Args:
        name: ``'logistic'``, ``'tree'`` or ``'rf'``.
        training: Training data passed to the selected MLlib trainer.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``name`` is not one of the supported keys.
    """
    if name == 'logistic':
        print_box()
        print("Logistic Regression Model")
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print("Decision Tree Model")
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print("Random Forest Model")
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                    numTrees=15, featureSubsetStrategy="auto", impurity='gini', maxDepth=5, maxBins=50)
    else:
        # An unknown name used to surface as an UnboundLocalError on the
        # return statement; say what is actually wrong.
        raise ValueError("Unknown model name: %r (expected 'logistic', 'tree' or 'rf')" % (name,))

    return model
Exemplo n.º 16
0
def trainOptimalModel(trainingData, testData):
	"""Grid-search decision-tree settings and return the model with the lowest test error.

	Every combination of impurity, maxDepth and maxBins listed below is
	trained on ``trainingData`` and scored with ``Evaluation.evaluate`` on
	``testData``. If a combination raises, the search stops, the failing
	parameters are logged, and the best model found so far is returned.

	Returns:
		The best model, or ``None`` if no combination completed.
	"""
	print("\nTraining optimal Decision Tree model started!")
	Utils.logTime()

	impurityVals = ['gini', 'entropy']
	maxDepthVals = [3,4,5,6,7]
	maxBinsVals = [8,16,32]

	optimalModel = None
	optimalMaxDepth = None
	optimalImpurity = None
	optimalBinsVal = None
	minError = None

	try:
		for curImpurity in impurityVals:
			for curMaxDepth in maxDepthVals:
				for curMaxBins in maxBinsVals:
					model = DecisionTree.trainClassifier(trainingData,
														 numClasses=2,
														 categoricalFeaturesInfo={},
														 impurity=curImpurity,
														 maxDepth=curMaxDepth,
														 maxBins=curMaxBins)
					testErr, _pr, _roc = Evaluation.evaluate(model, testData)
					# Test for None explicitly: `not minError` is also true
					# for a best error of 0.0, which let any later model
					# replace a perfect one, and comparing against None is
					# not valid on Python 3.
					if minError is None or testErr < minError:
						minError = testErr
						optimalImpurity = curImpurity
						optimalMaxDepth = curMaxDepth
						optimalBinsVal = curMaxBins
						optimalModel = model
	except Exception:
		# Best effort: record which combination failed and keep the best
		# model found so far.
		msg = "\nException during model training with below parameters:"
		msg += "\timpurity: " + str(curImpurity)
		msg += "\tmaxDepth: " + str(curMaxDepth)
		msg += "\tmaxBins: " + str(curMaxBins)
		Utils.logMessage(msg)

	logMessage(optimalModel, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
	return optimalModel
Exemplo n.º 17
0
    def test_classification(self):
        """Each classifier must separate the four ``scipy_matrix`` toy samples by label."""
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree

        # Samples 0 and 2 carry label 0.0; samples 1 and 3 carry label 1.0.
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        def check_predictions(fitted):
            # Same four assertions, in the same order, for every model.
            self.assertTrue(fitted.predict(features[0]) <= 0)
            self.assertTrue(fitted.predict(features[1]) > 0)
            self.assertTrue(fitted.predict(features[2]) <= 0)
            self.assertTrue(fitted.predict(features[3]) > 0)

        check_predictions(LogisticRegressionWithSGD.train(rdd))
        check_predictions(SVMWithSGD.train(rdd))
        check_predictions(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        check_predictions(DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
Exemplo n.º 18
0
    #
    #    print('\n== ACCURACY BAYES : ', accuracy_bayes , '==')
    #
    #    file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
    #    file.write('\n-> ACCURACY BAYES : ' + str(accuracy_bayes) + '\n')

    print("\n===================================================== ")
    print("=================== DECISION TREE =================== ")
    print("===================== (Entropy) ===================== ")
    print("=====================================================\n")

    print("\n=================== Training ================== \n")

    model_decision_tree_entropy = DecisionTree.trainClassifier(
        training,
        categoricalFeaturesInfo={},
        impurity="entropy",
        maxDepth=5,
        numClasses=2)
    print("Done : DT entropy training")

    print("\n=================== Testing =================== \n")

    #decision tree entropy
    predictions_decision_tree_enptropy = model_decision_tree_entropy.predict(
        test)
    num_pos_entropy = predictions_decision_tree_enptropy.countByValue()[1.0]
    num_neg_entropy = predictions_decision_tree_enptropy.countByValue()[0.0]

    #decision tree gini
    print("\n== PREDICTION ENTROPY : ==\n")
    print("- Positive : ", num_pos_entropy)
Exemplo n.º 19
0
    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'carbon2.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    #  categoricalFeaturesInfo declares features 0 and 1 as categorical with
    #  5 categories each; features not listed are treated as continuous.
    model = DecisionTree.trainClassifier(trainingData,
                                         numClasses=5,
                                         categoricalFeaturesInfo={
                                             0: 5,
                                             1: 5
                                         },
                                         impurity='entropy',
                                         maxDepth=5,
                                         maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
Exemplo n.º 20
0
    # Load data.
    dataPath = 'data/mllib/sample_libsvm_data.txt'
    if len(sys.argv) == 2:
        dataPath = sys.argv[1]
    if not os.path.isfile(dataPath):
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Re-index class labels if needed.
    (reindexedData, origToNewLabels) = reindexClassLabels(points)
    numClasses = len(origToNewLabels)

    # Train a classifier.
    categoricalFeaturesInfo = {}  # no categorical features
    model = DecisionTree.trainClassifier(
        reindexedData,
        numClasses=numClasses,
        categoricalFeaturesInfo=categoricalFeaturesInfo)
    # Print learned tree and stats.
    print("Trained DecisionTree for classification:")
    print("  Model numNodes: %d" % model.numNodes())
    print("  Model depth: %d" % model.depth())
    print("  Training accuracy: %g" % getAccuracy(model, reindexedData))
    if model.numNodes() < 20:
        print(model.toDebugString())
    else:
        print(model)

    sc.stop()
Exemplo n.º 21
0
# Preview: build a pandas frame from the first five rows of `dfa` and
# transpose it. The result is not assigned (notebook-style display).
pd.DataFrame(dfa.take(5), columns=dfa.columns).transpose()


def labelData(data):
    """Map each row to a LabeledPoint: column 9 is the label, columns 0-8 and 10-15 the features."""
    feature_positions = [i for i in range(16) if i != 9]

    def to_labeled_point(row):
        return LabeledPoint(row[9], [row[i] for i in feature_positions])

    return data.map(to_labeled_point)


# Random 80/20 split of the labeled rows.
trainData, testData = labelData(dfa).randomSplit([0.8, 0.2])

# Three-class gini tree; no feature is declared categorical.
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=3,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=10,
    maxBins=32)

# Dump the learned tree.
print(model.toDebugString())


def getPredictionLabels(model, testData):
    """Return ``(prediction, label)`` pairs for every element of ``testData``."""
    feature_vectors = testData.map(lambda r: r.features)
    predicted = model.predict(feature_vectors)
    actual = testData.map(lambda r: r.label)
    return predicted.zip(actual)


def printMetrics(pred_and_label):
    """Print the precision of classes 0 and 1 for the given (prediction, label) pairs."""
    metrics = MulticlassMetrics(pred_and_label)
    captions = (('Precision of 0', 0), ('Precision of 1', 1))
    for caption, class_id in captions:
        # Caption and value separated by one space, as a Python 2 print
        # statement with a comma would emit them.
        print('%s %s' % (caption, metrics.precision(class_id)))
Exemplo n.º 22
0
# Tokenize every raw line on commas.
csvData = rawData.map(lambda line: line.split(","))

# Turn each token list into a LabeledPoint.
trainingData = csvData.map(createLabeledPoints)

# One hypothetical candidate to score: 10 years of experience, currently
# employed, 3 previous employers, a BS degree, a non-top-tier school and no
# internship. A whole RDD of candidates could be loaded from disk instead.
testCandidates = [array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Fit the classifier; features 1, 3, 4 and 5 are declared categorical.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32)

# Predict for the unseen candidates. (The source data could also be split
# into training and test sets to measure accuracy while tuning parameters.)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
    print(result)

# Show the learned tree as well.
print('Learned classification tree model:')
print(model.toDebugString())
Exemplo n.º 23
0
            print >> f1, string.decode('utf8')

    elif algorithm == "DecisionTree":  #DecisionTree 201612 gini 10 32 1 0
        #         numClasses  = int(sys.argv[2])
        impurity = str(sys.argv[3])
        maxDepth = int(sys.argv[4])
        maxBins = int(sys.argv[5])
        minInstancesPerNode = int(sys.argv[6])
        minInfoGain = float(sys.argv[7])
        model = DecisionTree.trainClassifier(trainingData,
                                             numClasses=classNumber,
                                             categoricalFeaturesInfo={
                                                 6: 3,
                                                 7: 3,
                                                 8: 3,
                                                 9: 5,
                                                 10: 5,
                                                 11: 5
                                             },
                                             impurity=impurity,
                                             maxDepth=maxDepth,
                                             maxBins=maxBins)

        # Evaluate model on test instances and compute test error
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
            predictions)
        testErr = labelsAndPredictions.filter(
            lambda (v, p): v != p).count() / float(testData.count())
        with codecs.open('results.txt', "w", "utf-8") as f1:
            string = 'testErr:' + str(testErr)
Exemplo n.º 24
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        """Build a decision tree for the target column and return a DecisionTreeResult.

        The target column is ``measure_columns[0]``. When ``self._pandas_flag``
        is truthy a ``DecisionTreeRegressor`` is fitted on the data frame;
        otherwise ``DecisionTree.trainClassifier`` is trained on an RDD of
        LabeledPoints. The resulting tree text is converted through this
        object's helper methods and stored on a ``DecisionTreeResult``.

        Args:
            measure_columns: Sequence whose first element names the target
                column. NOTE(review): the default ``None`` is indexed
                unconditionally below, so callers must always pass a value.
            dimension_columns: Only compared against ``None``; not otherwise
                used in this method.

        Returns:
            The populated ``DecisionTreeResult``.
        """
        if dimension_columns is None:
            # NOTE(review): `dimensions` is not read again in this method.
            dimensions = self._dimension_columns
        self._target_dimension = measure_columns[0]
        dimension = self._target_dimension
        # Cap the allowed number of levels at sqrt(row count), rounded.
        max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
        max_num_levels = min(max_num_levels,
                             round(self._dataframe_helper.get_num_rows()**0.5))
        # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
        # Keep only dimension columns whose unique-value count fits the cap.
        all_dimensions = [
            dim for dim in self._dimension_columns
            if self._metaParser.get_num_unique_values(dim) <= max_num_levels
        ]
        # Every measure column except the target.
        all_measures = [
            x for x in self._measure_columns if x != self._target_dimension
        ]
        self.transform_data_frames()
        decision_tree_result = DecisionTreeResult()
        # Size of the mapping for each kept dimension, in all_dimensions order.
        cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            # No categorical features: fall back to 32 (used as maxBins below).
            max_length = 32
        # Index -> size; the index is the position in all_dimensions, which
        # come first among the feature columns selected below.
        cat_feature_info = dict(enumerate(cat_feature_info))
        # print cat_feature_info
        if self._pandas_flag:
            # NOTE(review): dimension_classes is not read again in this branch.
            dimension_classes = self._data_frame[dimension].nunique()
            # Reorder columns: target first, then dimensions, then measures.
            self._data_frame = self._data_frame[[dimension] + all_dimensions +
                                                all_measures]
            x = self._data_frame.drop(dimension, axis=1)
            y = self._data_frame[dimension]
            # Fill missing values in each feature column with its first mode.
            for i in x.columns:
                x[i] = x[i].fillna(x[i].mode()[0])
            model = DecisionTreeRegressor(max_depth=6)
            model = model.fit(x, y)
            output_result = self.tree_to_code(model, list(x.columns))
            output_result = list(map(lambda x: x.strip(), output_result))
            print(output_result, "output_result")
        else:
            # The number of distinct target values becomes numClasses.
            dimension_classes = self._data_frame.select(
                dimension).distinct().count()
            # Reorder columns: target first, then dimensions, then measures.
            self._data_frame = self._data_frame[[dimension] + all_dimensions +
                                                all_measures]
            # Column 0 is the label; the remaining columns are the features.
            data = self._data_frame.rdd.map(
                lambda x: LabeledPoint(x[0], x[1:]))
            # Weights [1.0, 0.0]: the second part of the split gets weight
            # zero; testData is not used afterwards.
            (trainingData, testData) = data.randomSplit([1.0, 0.0])
            # TO DO : set maxBins at least equal to the max level of categories in dimension column
            model = DecisionTree.trainClassifier(
                trainingData,
                numClasses=dimension_classes,
                categoricalFeaturesInfo=cat_feature_info,
                impurity='gini',
                maxDepth=6,
                maxBins=max_length)
            output_result = model.toDebugString()
        # Post-process the tree through this object's helpers.
        decision_tree = self.tree_json(output_result, self._data_frame,
                                       self._pandas_flag)
        self._new_tree = self.generate_new_tree(decision_tree)
        node_list = self.node_name_extractor(self._new_tree)
        node_list = list(self.flatten(node_list))
        # NOTE(review): self._count_list is not set in this method;
        # presumably one of the helper calls above fills it - confirm.
        correct_count_list = [i[0] for i in self._count_list]
        # Pair each flattened node with the first entry of its count record.
        tree_dict = dict(list(zip(node_list, correct_count_list)))
        #self._new_tree = self.generate_new_tree_total(decision_tree)
        self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
        self._path_dict = self.path_dict_creator(node_list, self._new_tree)
        # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
        # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
        decision_tree_result.set_params(self._new_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability, self._path_dict)
        decision_tree_result.set_target_map(
            self._mapping_dict[self._target_dimension], self._aggr_data,
            self._important_vars)

        # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]*self._scriptStages["dtreeTrainingStart"]["weight"]/10
        # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
        #                             "dtreeTrainingEnd",\
        #                             "info",\
        #                             self._scriptStages["dtreeTrainingEnd"]["summary"],\
        #                             self._completionStatus,\
        #                             self._completionStatus)
        # CommonUtils.save_progress_message(self._messageURL,progressMessage)
        # self._dataframe_context.update_completion_status(self._completionStatus)
        # Record the "dtreeTrainingEnd" progress stage.
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context,
            self._scriptWeightDict,
            self._scriptStages,
            self._analysisName,
            "dtreeTrainingEnd",
            "info",
            weightKey="script")

        # print decision_tree_result
        return decision_tree_result
def train_validate_test_rpart():
    """Train a depth-2 MLlib decision tree on patient/procedure data and print test metrics.

    Steps, all executed inside one ``try`` block:

    1. Load the patient-procedure CSV into a DataFrame.
    2. Keep every anomalous row plus a random sample of about 50,000 benign rows.
    3. Record the distinct values of the three categorical columns.
    4. Split the rows 50/50 into train and test sets and convert both to
       labeled points with ``create_labeled_point``.
    5. Train ``DecisionTree.trainClassifier`` and print accuracy, false-positive
       rate and false-negative rate measured on the test split.

    Any exception is caught and its traceback is printed to stdout.

    Returns:
        The trained model, or ``None`` when an exception was raised before
        training finished.
    """
    def _ratio(numerator, denominator):
        # True (float) division on both Python 2 and 3; NaN when the
        # denominator is zero instead of raising ZeroDivisionError.
        return numerator / float(denominator) if denominator else float("nan")

    # Bound up front so the final ``return`` cannot raise UnboundLocalError
    # when an exception is caught before the model has been trained.
    model = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")

        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(n_benign))  # anom.count() = 49542, benign.count() = 197406

        # Down-sample the benign rows to roughly 50,000. The fraction is
        # computed with float division (integer division would yield 0 on
        # Python 2) and clamped to 1.0 because sampling without replacement
        # cannot take more than everything.
        benign_fraction = min(1.0, _ratio(50000, n_benign)) if n_benign else 0.0
        sample_from_benign = benign.sample(False, benign_fraction)
        pat_proc = anom.unionAll(sample_from_benign)
        print("pat_proc.count() = " + str(pat_proc.count()))  # 99,227

        all_columns = pat_proc.columns
        features = [x for x in all_columns if x not in ["patient_id", "is_anomalous"]]
        # Only these three are listed as categorical; the procedure features
        # have 0-1 values anyway.
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if x not in categorical_features]

        # dict_cat_features maps each categorical feature name to
        # [sequential feature id, number of distinct values, sorted distinct values].
        dict_cat_features = {}
        for cat_feature_number, feature in enumerate(categorical_features):
            # collect() ships every distinct value to the driver, so it is
            # only acceptable for low-cardinality columns such as these.
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            distinct_values = sorted(row[0].encode('ascii', 'ignore') for row in agvalues)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]

        pat_proc = pat_proc.rdd
        print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions()))  # 4 partitions observed; 8 logical cores were expected

        (train, test) = pat_proc.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        # categoricalFeaturesInfo: feature id -> number of categories.
        cat_features_info = dict((value[0], value[1]) for value in dict_cat_features.values())
        # NOTE(review): this range stops at 1 + len(procedure_features), so if
        # procedure features occupy ids 3 .. 2 + len(procedure_features) the
        # last one is not declared categorical -- confirm against
        # create_labeled_point before changing the bound.
        procedure_features_info = dict((feature_id, 2) for feature_id in range(3, 2 + len(procedure_features)))
        cat_features_info.update(procedure_features_info)

        t0 = time()
        # Under the hood in DecisionTree.scala, RandomForest is called with
        # numTrees = 1 and featureSubsetStrategy = "all".
        model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo=cat_features_info, impurity='gini', maxDepth=2, maxBins=32)
        tt = time() - t0
        # Observed: 63.355 seconds (5.5 times standalone R); 61.942 seconds
        # even after reducing maxDepth from 5 to 2.
        print("Classifier trained in {} seconds".format(round(tt, 3)))
        print(model)

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3)))  # 0.014 seconds

        # Pairs of (actual label, predicted label). Each pair reaches the
        # lambdas below as a single tuple argument.
        labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
        n_correct = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count()
        n_negative = labels_and_preds.filter(lambda vp: vp[0] == 0).count()
        n_positive = labels_and_preds.filter(lambda vp: vp[0] == 1).count()
        n_false_positive = labels_and_preds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count()
        n_false_negative = labels_and_preds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count()

        test_accuracy = _ratio(n_correct, test_data_size)
        fpr = _ratio(n_false_positive, n_negative)
        fnr = _ratio(n_false_negative, n_positive)
        # With maxDepth = 5: accuracy 0.9084, fpr 0.1555, fnr 0.0272.
        # With maxDepth = 2: accuracy 0.861, fpr 0.2591, fnr 0.018.
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))
        print(model.toDebugString())

    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return model
Exemplo n.º 26
0
# Drop the header row and all double quotes from the raw training lines, then
# split every line on commas.
# NOTE(review): `header` is rebound for the test file a few lines below, before
# any action has run. The filter is lazy and may resolve `header` only when an
# action executes; presumably both files share the same header line -- verify.
header = train_rawDataWithHeader.first()
rawData = train_rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
train_data = rData.map(lambda x: x.split(","))
#test_data=prepare_data(test)
# Same cleaning steps for the raw test lines.
header = test_rawDataWithHeader.first()
rawData = test_rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
test_data = rData.map(lambda x: x.split(","))

# Combine both sets so that every distinct value of column 0 receives an index.
lines=train_data+test_data
categoriesMap = lines.map(lambda fields: fields[0]).distinct().zipWithIndex().collectAsMap()
# Training rows become LabeledPoints; test rows become (features, last raw field) pairs.
train_RDD=train_data.map(lambda r:LabeledPoint(extract_label(r),extract_features(r,categoriesMap,len(r)-1)))
test_RDD=test_data.map(lambda r: (extract_features(r,categoriesMap,len(r)-1),r[-1]))

model=DecisionTree.trainClassifier(train_RDD,numClasses=2,categoricalFeaturesInfo={},impurity='entropy',maxDepth=14,maxBins=9)

# Evaluation counters. Only `count`, `num` and `negative` are updated in the
# part of the loop that is visible here.
count = 0
num = 0
positive = 0
negative = 0
truePositive = 0
trueNegative =0
falsePositive = 0
falseNegative = 0
# Bring every test record to the driver and predict them one at a time.
for data in test_RDD.take(test_data.count()):
    num+=1
    preds = int(model.predict(data[0]))
    #print(str(preds)+' '+str(data[1]))
    # data[1] is the last raw field of the row, compared here as the true label.
    if(preds == int(data[1])):count=count+1
    if int(data[1]) == 0: negative += 1
Exemplo n.º 27
0
# Inspect the result of the data preprocessing
# print(labelpointRDD.first())

# Randomly split the data 3:1 (75%:25%) into a train set and a test set
# using randomSplit
(trainData, testData) = labelpointRDD.randomSplit([3, 1])
# print(testData.count())

# Persist the train set and the test set in memory to speed up execution
trainData.persist()
testData.persist()

# Parameter values to tune (each one is used as maxDepth below)
params = [5, 10, 15, 20]
for i in params:
    # Train the decision tree provided by Spark MLlib
    model = DecisionTree.trainClassifier(trainData, numClasses=2, categoricalFeaturesInfo={}, impurity="entropy", maxDepth=i, maxBins=15)
    # Predict on testData with model.predict
    score = model.predict(testData.map(lambda p: p.features))
    # Print the predictions
    # score.foreach(print)
    print(score.collect())  # all
    # print(score.take(2))  # first two
    # print(score.count())  # number of records
    # Pair each prediction with its true label
    scoreAndLabels = score.zip(testData.map(lambda p: p.label))
    # Use MulticlassMetrics to build the confusion matrix and compute
    # accuracy, recall and precision (recall/precision are reported for label 0)
    metrics = MulticlassMetrics(scoreAndLabels)
    print(metrics.confusionMatrix())
    print("Accuracy = %s" % metrics.accuracy)
    print("Recall = %s" % metrics.recall(0))
    print("Precision = %s" % metrics.precision(0))
    # Train three classifiers on `trained_hashed`, score each on
    # `check_hashed`, print the result and save the model.

    # --- Logistic regression ---
    LR_model = LogisticRegressionWithLBFGS.train(trained_hashed)
    LR_prediction_and_labels = check_hashed.map(lambda point: (LR_model.predict(point.features), point.label))
    # RDD.filter hands each (predicted, actual) pair to the function as ONE
    # tuple argument, so the predicate must index into it; a two-parameter
    # lambda raises TypeError when the job runs.
    LR_correct = LR_prediction_and_labels.filter(lambda pair: pair[0] == pair[1])
    LR_accuracy = LR_correct.count() / float(check_hashed.count())
    print ("LR training accuracy:" + str(LR_accuracy * 100) + " %")
    LR_output_dir = 'hdfs://master:9000/user/hadoop/LogisticRegression'
    # NOTE(review): shutil.rmtree works on the local filesystem only; with an
    # hdfs:// URL and ignore_errors=True this most likely does nothing --
    # confirm how the previous model directory is meant to be removed.
    shutil.rmtree("hdfs://master:9000/user/hadoop/LogisticRegression/metadata", ignore_errors=True)
    LR_model.save(cc, LR_output_dir)

    # --- Linear SVM ---
    SVM_model = SVMWithSGD.train(trained_hashed, iterations=10)
    SVM_prediction_and_labels = check_hashed.map(lambda point: (SVM_model.predict(point.features), point.label))
    # NOTE(review): clearThreshold() makes predict() return raw scores instead
    # of 0/1 labels. The map above is lazy, so it may already apply to the
    # accuracy computed below -- confirm whether this call should move after
    # the accuracy is computed.
    SVM_model.clearThreshold()
    SVM_correct = SVM_prediction_and_labels.filter(lambda pair: pair[0] == pair[1])
    SVM_accuracy = SVM_correct.count() / float(check_hashed.count())
    print ("SVM training accuracy:" + str(SVM_accuracy * 100) + " %")
    SVM_output = 'hdfs://master:9000/user/hadoop/SVM'
    # NOTE(review): same local-filesystem caveat as for the LR directory above.
    shutil.rmtree("hdfs://master:9000/user/hadoop/SVM/metadata", ignore_errors=True)
    SVM_model.save(cc, SVM_output)

    # --- Decision tree ---
    model = DecisionTree.trainClassifier(trained_hashed, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)
    # Tree models are applied to a whole RDD of feature vectors, then the
    # predictions are zipped back with the labels.
    predictions = model.predict(check_hashed.map(lambda x: x.features))
    labelsAndPredictions = check_hashed.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(check_hashed.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    model.save(cc, "hdfs:///user/hadoop/DT")

Exemplo n.º 29
0
from numpy import array
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# Four single-feature samples given as (label, feature value); only the
# sample with feature value 0.0 carries label 0.0.
labels_and_values = [(0.0, 0.0), (1.0, 1.0), (1.0, 2.0), (1.0, 3.0)]
data = [LabeledPoint(label, [value]) for label, value in labels_and_values]

# Two classes, no categorical features.
training_rdd = sc.parallelize(data)
model = DecisionTree.trainClassifier(training_rdd, 2, {})
print(model)
Exemplo n.º 30
0
    attack = 0.0
    if len(line_split) >= 9 and line_split[9] == 'title':
        attack = 1.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))


# Convert the raw CSV records of both sets into LabeledPoints.
training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

# Build the model
# Feature 0 is declared categorical with len(protocols) categories; t0/tt
# measure the wall-clock training time.
t0 = time()
tree_model = DecisionTree.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={0: len(protocols)},
    impurity='gini',
    maxDepth=4,
    maxBins=100)
tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt, 3))

# Predict on the test features and pair every true label with its prediction.
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

# Accuracy = matching (label, prediction) pairs / number of test records.
# The timer spans the count() actions, which is where the lazy prediction
# pipeline defined above actually runs.
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(
    test_data.count())
tt = time() - t0
Exemplo n.º 31
0
# Create a test candidate, with 10 years of experience, currently employed,
# 3 previous employers, a BS degree, but from a non-top-tier school where
# he or she did not do an internship. You could of course load up a whole
# huge RDD of test candidates from disk, too.
testCandidates = [array([10, 1, 3, 1, 0, 0])]
# Turn the candidate list into an RDD.
testData = sc.parallelize(testCandidates)

# Train our DecisionTree classifier using our data set.
# Features 1, 3, 4 and 5 are declared categorical with 2, 4, 2 and 2
# categories respectively; the remaining features are treated as continuous.
model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={
                                         1: 2,
                                         3: 4,
                                         4: 2,
                                         5: 2
                                     },
                                     impurity='gini',
                                     maxDepth=5,
                                     maxBins=32)

# Now get predictions for our unknown candidates. (Note, you could separate
# the source data into a training set and a test set while tuning
# parameters and measure accuracy as you go!)
predictions = model.predict(testData)
print('Hire prediction:')

# The prediction above has only been defined lazily; collect() is the action
# that computes it and brings the results back to the driver.
results = predictions.collect()
for result in results:
from pyspark import SparkContext
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

import json
from bson import json_util
from bson.json_util import dumps

if __name__ == "__main__":

	sc = SparkContext(appName="DecisionTreeClassification")

	raw_data = MLUtils.loadLibSVMFile(sc, '/home/hechem/spark-campaign-classification/test/data/sample_libsvm_data.txt')
	(trainingDataSet, testDataSet) = raw_data.randomSplit([0.7, 0.3])

	tree = DecisionTree.trainClassifier(trainingDataSet, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=30)

	predictions = tree.predict(testDataSet.map(lambda x: x.features))
	labelsAndPredictions = testDataSet.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testDataSet.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification tree model:')
	print(tree.toDebugString())
	tree_to_json = tree.toDebugString()
	
	# Parser
	def parse(lines):
		block = []
		while lines :
			
			if lines[0].startswith('If'):
Exemplo n.º 33
0
sc = SparkContext()

# Maps a predicted class value to the text printed for it.
result = {1.0: 'yes', 0.0: 'no'}

# Fish-classification data set from chapter 3 of "Machine Learning in Action"
data = [
    LabeledPoint(1, [1, 1]),
    LabeledPoint(1, [1, 1]),
    LabeledPoint(0, [1, 0]),
    LabeledPoint(0, [0, 1]),
    LabeledPoint(0, [0, 1])
]

rdd = sc.parallelize(data)

# Debug output: the RDD's type, its attributes and its contents.
print '------------------------------------'
print type(rdd), dir(rdd)
print rdd.collect()
print '------------------------------------'
# NOTE(review): numClasses is 3 although only labels 0 and 1 occur in `data`
# above -- confirm whether 2 was intended.
model = DecisionTree.trainClassifier(rdd, 3, {})

# print(model)

# Print the learned tree and the prediction for three feature vectors.
print '********************************************************'
print(model.toDebugString())
print "test [1,0]: %s" % (result[model.predict(array([1, 0]))])
print "test [1,1]: %s" % (result[model.predict(array([1, 1]))])
print "test [0,0]: %s" % (result[model.predict(array([0, 0]))])
print '********************************************************'
sc.stop()
Exemplo n.º 34
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        """Train a decision tree that predicts the first entry of ``dimension_columns``.

        Every dimension column of ``self._data_frame`` is re-encoded as an
        integer index, with the index -> value mapping kept in
        ``self._mapping_dict``. The frame is then reordered to
        ``[target] + other dimensions + measures`` and turned into
        ``LabeledPoint`` rows, on which an MLlib decision tree (gini,
        maxDepth=3) is trained.

        Note: this method replaces ``self._data_frame`` with the re-encoded,
        reordered frame.

        Args:
            measure_columns: accepted for interface compatibility; not read by
                the training logic, which always uses ``self._measure_columns``.
            dimension_columns: sequence whose first element names the target
                dimension. Must not be ``None`` or empty.

        Returns:
            A ``DecisionTreeResult`` populated through ``set_params`` with the
            parsed tree and the ``_new_rules``, ``_total``, ``_success`` and
            ``_probability`` attributes of ``self`` (presumably filled in by
            ``generate_probabilities`` -- verify).
        """
        dimension = dimension_columns[0]
        all_dimensions = self._dimension_columns
        all_measures = self._measure_columns
        columns_without_dimension = [x for x in all_dimensions if x != dimension]
        decision_tree_result = DecisionTreeResult()

        # index -> distinct value (as string) for every dimension column.
        mapping_dict = {}
        for column in all_dimensions:
            distinct_values = self._data_frame.select(column).distinct().rdd.map(
                lambda x: str(x[0])).collect()
            mapping_dict[column] = dict(enumerate(distinct_values))

        # Converting the Spark dataframe to pandas for the re-encoding and
        # then back to a Spark dataframe.
        pandasDataFrame = self._data_frame.toPandas()
        for key in mapping_dict:
            reverseMap = {v: k for k, v in mapping_dict[key].items()}
            # Missing values were stringified to 'None' when the mapping was
            # built, so they are looked up under that key.
            pandasDataFrame[key] = pandasDataFrame[key].apply(
                lambda x: reverseMap['None' if x is None else x])
        self._data_frame = self._spark.createDataFrame(pandasDataFrame)
        self._mapping_dict = mapping_dict

        # Number of categories per non-target dimension, in feature order.
        category_counts = [
            self._data_frame.select(c).distinct().count()
            for c in columns_without_dimension
        ]
        max_length = max(category_counts) if category_counts else 32
        cat_feature_info = dict(enumerate(category_counts))
        dimension_classes = self._data_frame.select(
            dimension).distinct().count()

        # Column 0 becomes the label; the remaining columns are the features,
        # so feature i corresponds to columns_without_dimension[i].
        self._data_frame = self._data_frame[[dimension] +
                                            columns_without_dimension +
                                            all_measures]
        data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
        # All rows go to the training split; the zero-weight split is unused.
        trainingData = data.randomSplit([1.0, 0.0])[0]
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=3,
            maxBins=max_length)
        output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame)
        self.generate_probabilities(decision_tree, dimension)
        decision_tree_result.set_params(decision_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability)

        return decision_tree_result
    # Append Labels
    appendColumn(ensemble_test, rf_test_predict_label)
    appendColumn(ensemble_train, rf_train_predict_label)

# Decision Trees
# C13 - C21
# Build the Model
max_depth = [5, 10, 15, 20]

# Train one 10-class tree per candidate depth and collect its predictions for
# every test and training feature vector.
for m_depth in max_depth:
    # Build the Model
    model = DecisionTree.trainClassifier(train_data,
                                         10, {},
                                         impurity='gini',
                                         maxDepth=m_depth)

    # Predict Labels (test set first, then training set)
    rf_test_predict_label = [model.predict(sample) for sample in test_features]
    rf_train_predict_label = [model.predict(sample) for sample in train_features]

    # Append Labels
Exemplo n.º 36
0
	if fields[6] == "Y":
		hired = 1
	else:
		hired = 0

	return LabeledPoint(hired, [years_of_exp,employed,previousEmployers,education_level,top_tier_school,internship])



# Location of the historical hiring data (CSV).
path = '/home/sejal/Documents/datascience/dataset/data/emp/candidates_hired_past.csv'

# Read the file and split every line into its comma-separated fields.
r1 = sc.textFile(path)
r2 = r1.map(lambda entry: entry.split(','))

# Convert every record into a LabeledPoint.
training_data = r2.map(prepare_data_for_DT)

# A single candidate to classify; the values follow the feature order used by
# prepare_data_for_DT.
test_data = [10,1,2,2,1,0]

# Features 1, 3, 4 and 5 are declared categorical with 2, 4, 2 and 2
# categories; all other training parameters keep their defaults.
model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={1:2, 3:4,4:2,5:2})

# test_data is a plain list, not an RDD, so this is a single local prediction.
predictions = model.predict(test_data)

print("Hire OR No-Hire")
print (predictions)

print (model.toDebugString())
# results = predictions.collect()

# for result in results:
# 	print result
Exemplo n.º 37
0
#exec(open("./doweathclass_dectree.py").read())

# ---------------- now try decision tree ------------
from pyspark.mllib.tree import DecisionTree
# Binary classifier (numClasses=2) with no categorical features, entropy
# impurity, and at least 2 training instances required in every child node.
dt_model = DecisionTree.trainClassifier(datax_rdd,
                                        2, {},
                                        impurity='entropy',
                                        maxDepth=3,
                                        maxBins=32,
                                        minInstancesPerNode=2)

# maxDepth and maxBins are the main tuning parameters.
# The {} argument is categoricalFeaturesInfo: a dict mapping the index of each
# categorical feature to its number of categories (empty = all continuous).
# For regression, use DecisionTree.trainRegressor, which takes no numClasses.
print(dt_model.toDebugString())

#results in this:
#DecisionTreeModel classifier of depth 3 with 9 nodes
#  If (feature 1 <= 0.0)
#   If (feature 4 <= 80.0)
#    If (feature 3 <= 68.0)
#     Predict: 0.0
#    Else (feature 3 > 68.0)
#     Predict: 1.0
#   Else (feature 4 > 80.0)
#    If (feature 0 <= 0.0)
#     Predict: 0.0
#    Else (feature 0 > 0.0)
#     Predict: 0.0
#  Else (feature 1 > 0.0)
#   Predict: 1.0
Exemplo n.º 38
0
def main():
    """Train, evaluate and save a decision tree on the Bosch numeric training data.

    Reads the gzipped CSV twice (once to get the column names, once with an
    explicit schema), drops the columns listed in ``column_refine_list.csv``,
    trains ``DecisionTree.trainClassifier`` on a 70/30 split, saves the model,
    prints the test accuracy and MCC, and finally writes the reduced DataFrame
    back to disk as CSV.

    Relies on the module-level ``sc`` and ``sqlContext`` objects and on the
    helpers ``labelData``, ``getLabelsPredictions`` and ``printMCC``.
    """
    # The session object itself is not used below; the reads go through the
    # module-level sqlContext.
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    # Import a messed up DataFrame (DF) to get Column info.
    # NOTE(review): the option key is spelled 'inferchema'; the usual spelling
    # is 'inferSchema'. Only the column names are used here -- confirm whether
    # the option is needed at all.
    raw_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferchema='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz")
    features = raw_df.columns
    # Manually specify the correct datatypes for each Column and import new DF:
    # every column is a nullable float except the first and last, which are
    # integers.
    fields = [
        StructField(field_name, FloatType(), True) for field_name in features
    ]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz", schema=customSchema)
    # NOTE(review): DataFrame.na.fill returns a new DataFrame; the result is
    # discarded here, so this statement has no effect on `df`. Confirm the
    # intent before assigning it back, since that would change the training
    # data.
    df.na.fill(NaN)

    # Prepare feature for computation!
    # Remove features from a list precompiled on correlation criterion.
    # The last row of the file wins; an empty file leaves nothing to remove
    # (the name was previously unbound in that case).
    column_to_go = []
    with open('column_refine_list.csv', 'r') as f:
        for row in csv.reader(f, delimiter=','):
            column_to_go = row[:]
    print("Total numer of features to be removed: %d" % (len(column_to_go)))
    print("\n")
    for item in column_to_go:
        df = df.drop(item)
    print("Final number of features is: %d" % (len(df.columns[1:-1])))

    # Decision Tree model Training
    training_points = labelData(df)  # csv_numeric.map(ReducedlabelData)
    training_data, test_data = training_points.randomSplit([0.7, 0.3])
    print(training_data.first())
    t0 = time()
    # maxDepth was written as the octal literal 04, which equals 4 on
    # Python 2 and is a syntax error on Python 3.
    tree_model = DecisionTree.trainClassifier(training_data,
                                              numClasses=2,
                                              categoricalFeaturesInfo={},
                                              impurity='gini',
                                              maxDepth=4,
                                              maxBins=100)
    tt = time() - t0
    tree_model.save(
        sc, "DTmodel_model_reduced_222red")  # Save the model for future use!
    print("Model trained in : %.4f Sec" % (tt))
    print(tree_model.toDebugString())

    # Making predictions on the test set
    t0 = time()
    labels_and_preds = getLabelsPredictions(tree_model, test_data)
    # Each element is a (label, prediction) pair passed as one tuple.
    test_accuracy = 100 * labels_and_preds.filter(
        lambda vp: vp[0] == vp[1]).count() / float(test_data.count())
    print("Test accuracy is    : %.4f " % (test_accuracy))
    printMCC(labels_and_preds)
    tt = time() - t0
    print("Predictions and metrics computed in : %.4f Sec" % (tt))

    # Rebuild the same all-float / integer-ends schema from the original
    # column list and pass it along when saving the reduced DataFrame.
    features_new = raw_df.columns
    fields = [
        StructField(field_name, FloatType(), True)
        for field_name in features_new
    ]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df.select(
        df.columns
    ).write.format('com.databricks.spark.csv').options(header='true').save(
        '/data/ganesh/BigData/Bosch/Source/feature_reduction/numeric/feature_reduction_V2/df_739.csv',
        schema=customSchema)
    sc.stop()
Exemplo n.º 39
0
        7: len(destination_mapping.value)
    }

    splits = text_rdd.randomSplit([0.7, 0.3])
    (training_rdd, test_rdd) = (splits[0], splits[1])
    training_data = training_rdd.map(
        Utils.parse_flight).map(lambda rdd: Utils.create_labeled_point(
            rdd, carrier_mapping.value, origin_mapping.value,
            destination_mapping.value))

    classes_count = 2
    impurity = "gini"
    max_depth = 9
    max_bins = 7000
    model = DecisionTree.trainClassifier(training_data, classes_count,
                                         categorical_features_info, impurity,
                                         max_depth, max_bins)

    Utils.save_model_to_grid(model, "DecisionTreeFlightModel", sc)
    save_mapping(carrier_mapping.value, "CarrierMap", sqlc)
    save_mapping(origin_mapping.value, "OriginMap", sqlc)
    save_mapping(destination_mapping.value, "DestinationMap", sqlc)

    # Test model
    test_data = test_rdd.map(lambda r: Utils.parse_flight(r)) \
        .map(lambda rdd: Utils.create_labeled_point(rdd,
                                                    carrier_mapping.value,
                                                    origin_mapping.value,
                                                    destination_mapping.value))
    predictions = model.predict(test_data.map(lambda x: x.features))
    labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
Exemplo n.º 40
0
        trainingData_rdd.cache()

        ###############################################################################
        if 'Tree_mllib' in model_list:
            t0 = datetime.datetime.now()

            # Only change parameters here
            tree_mllib_par_dict = {}
            tree_mllib_par_dict['numClasses'] = 2
            tree_mllib_par_dict['impurity'] = 'gini'
            tree_mllib_par_dict['maxDepth'] = 10

            model_mllib_tree = DecisionTree.trainClassifier(
                trainingData_rdd,
                numClasses=tree_mllib_par_dict['numClasses'],
                categoricalFeaturesInfo={},
                impurity=tree_mllib_par_dict['impurity'],
                maxDepth=tree_mllib_par_dict['maxDepth'])

            mllib_model_accuracy('Tree', model_mllib_tree, trainingData_rdd,
                                 testData_rdd)

            modelparameter_dict['Tree_mllib'] = tree_mllib_par_dict

            runtime_write('Tree_mllib', t0)

        ###############################################################################
        # Random forest
        if 'RF_mllib' in model_list:
            t0 = datetime.datetime.now()
Exemplo n.º 41
0
        clean_line_split[3] = len(flags)

    # convert label to binary label
    attack = 1.0
    if len(line_split) >= 42 and line_split[41]=='normal.':
        attack = 0.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

# Convert the raw CSV records of both sets into LabeledPoints.
training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

# Build the model
# Features 1, 2 and 3 are declared categorical, sized by the number of known
# protocols, services and flags; t0/tt measure the wall-clock training time.
t0 = time()
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2, 
                                          categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)},
                                          impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0

print "Classifier trained in {} seconds".format(round(tt,3))

# Predict on the test features and pair every true label with its prediction.
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

# Accuracy = matching (label, prediction) pairs / number of test records.
# The timer spans the count() actions, which is where the lazy prediction
# pipeline defined above actually runs.
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0

print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4))

print "Learned classification tree model:"
Exemplo n.º 42
0
sc = SparkContext.getOrCreate()
data = MLUtils.loadLibSVMFile(sc, 'data/dataLibSVM.txt')
print(data)

# Split the data 70/30 into training and test sets. The fixed seed makes the
# split, and therefore the results, reproducible.
(trainingSet, testSet) = data.randomSplit([0.7, 0.3], seed=7)

##################
# DECISION TREES #
##################

fitDT = DecisionTree.trainClassifier(trainingSet,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=3,
                                     maxBins=32)

print(fitDT.toDebugString())

predictionsDT = fitDT.predict(testSet.map(lambda x: x.features))

# Pairs of (true label, predicted label).
labelsAndPredictionsDT = testSet.map(lambda lp: lp.label).zip(predictionsDT)

# Test error rate = mismatching pairs / test-set size. Each pair reaches the
# predicate as one tuple; tuple-unpacking lambda parameters are not valid
# syntax on Python 3 (PEP 3113), so the pair is indexed instead.
testErrDT = labelsAndPredictionsDT.filter(lambda vp: vp[0] != vp[1]).count() / float(testSet.count())

print('Test Error = {0}'.format(testErrDT))
Exemplo n.º 43
0
def main():
    """Compare four MLlib classifiers on the pickled 'BadOrGood' data set.

    Loads records (dicts with 'label' and 'feature' keys) from ``/pickleData``,
    standardizes the features, splits 70/30 with a fixed seed, trains logistic
    regression (LBFGS and SGD), a decision tree and a random forest, and prints
    the F1 score and accuracy of each, followed by the two logistic-regression
    weight vectors.

    Relies on the module-level helpers ``test``, ``fone`` and ``accuracy``.
    """
    appName = "BadOrGood;zl"

    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The key was misspelled "spark.executor.instance", which Spark
            # does not recognise, so the requested executor count was ignored.
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    # Created but not otherwise used in this function.
    hc = HiveContext(sc)

    # Load data. The pickle file holds one dict per record with the keys
    # 'label' and 'feature'.
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

    # Standardizer (with mean and with std) fitted on all feature vectors.
    model = StandardScaler(True, True) \
        .fit(AllDataRawrdd.map(lambda rec: Vectors.dense(rec['feature'])))
    labels = AllDataRawrdd.map(lambda rec: rec['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda rec: rec['feature']))
    # Re-attach every label to its standardized feature vector.
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map(lambda pair: {'label': pair[0], 'feature': pair[1]})

    # Sampling: 70% train / 30% test, fixed seed.
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(lambda rec: LabeledPoint(rec['label'], rec['feature'])).persist()
    testDatardd = testDataRawrdd.map(lambda rec: {'label': rec['label'], 'feature': list(rec['feature'])}).persist()

    # Prediction & test: train each model, then score it on the test set.
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)

    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)

    # Fourth positional argument is numTrees.
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
    print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
    print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
    print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

    print(lrmLBFGS.weights)
    print(lrmSGD.weights)

    sc.stop()
Exemplo n.º 44
0
    def test_classification(self):
        """Check that each MLlib classifier separates a tiny two-class data set.

        Trains logistic regression, SVM, naive Bayes, a decision tree, a
        random forest and gradient-boosted trees on four labeled points and
        asserts the predicted class of each point. For the three tree-based
        models it also checks that a model saved to a temporary directory and
        loaded again has the same debug string.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def assert_separates_classes(model):
            # Points 0 and 2 carry label 0.0; points 1 and 3 carry label 1.0.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        def assert_save_load_roundtrip(model, model_class, dir_name):
            # A model written to disk and read back must describe the same tree(s).
            model_dir = os.path.join(temp_dir, dir_name)
            model.save(self.sc, model_dir)
            same_model = model_class.load(self.sc, model_dir)
            self.assertEqual(same_model.toDebugString(),
                             model.toDebugString())

        temp_dir = tempfile.mkdtemp()
        # try/finally so the temporary directory is removed even when an
        # assertion fails part-way through.
        try:
            lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
            assert_separates_classes(lr_model)

            svm_model = SVMWithSGD.train(rdd, iterations=10)
            assert_separates_classes(svm_model)

            nb_model = NaiveBayes.train(rdd)
            assert_separates_classes(nb_model)

            categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
            dt_model = DecisionTree.trainClassifier(
                rdd,
                numClasses=2,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                maxBins=4)
            assert_separates_classes(dt_model)
            assert_save_load_roundtrip(dt_model, DecisionTreeModel, "dt")

            rf_model = RandomForest.trainClassifier(
                rdd,
                numClasses=2,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numTrees=10,
                maxBins=4,
                seed=1)
            assert_separates_classes(rf_model)
            assert_save_load_roundtrip(rf_model, RandomForestModel, "rf")

            gbt_model = GradientBoostedTrees.trainClassifier(
                rdd,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numIterations=4)
            assert_separates_classes(gbt_model)
            assert_save_load_roundtrip(gbt_model, GradientBoostedTreesModel, "gbt")
        finally:
            # Best-effort cleanup: a failure to delete must not mask the
            # test result.
            try:
                rmtree(temp_dir)
            except OSError:
                pass
# Pair every true label with the SVM prediction for the same record.
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
# Training error = mismatching pairs / number of records. Each pair reaches
# the predicate as one tuple; tuple-unpacking lambda parameters are not valid
# syntax on Python 3 (PEP 3113), so the pair is indexed instead.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr)) ## 0.555395278766

############################ Decision TREE ##############################

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def parsePoint(line):
    """Convert one comma-separated record into a LabeledPoint.

    Every field is parsed as a float. The last field becomes the label and
    the first nine fields become the feature vector.
    """
    numbers = list(map(float, line.split(',')))
    label = numbers[-1]
    features = numbers[:9]
    return LabeledPoint(label, features)

# Load the CSV and drop its header row before parsing the records.
data = sc.textFile("/Users/mac/Desktop/USF/MSAnalytics/Spring1/ML2/ML Project/plays.csv")
header = data.first()
data = data.filter(lambda x: x != header)
parsedData = data.map(parsePoint)

# Empty categoricalFeaturesInfo: every feature is treated as continuous.
model = DecisionTree.trainClassifier(parsedData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=30, maxBins=100)

# Evaluate model on training instances and compute training error
predictions = model.predict(parsedData.map(lambda x: x.features))
labelsAndPredictions = parsedData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair rather than unpacking it in the lambda
# signature: tuple parameters are a syntax error on Python 3 (PEP 3113).
trainErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())  # observed: 0.09
print('Training Error = ' + str(trainErr))
print('Learned classification tree model:')
print(model)
Exemplo n.º 46
0
    def test_all(self, measure_columns=None, dimension_columns=None):
        """Train a decision tree for the target dimension and package the result.

        The first entry of ``dimension_columns`` is the target (class) column.
        The other dimension columns are kept only when their number of unique
        values does not exceed ``GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL``.
        When ``self._pandas_flag`` is set, the data frame is one-hot encoded
        and a scikit-learn ``DecisionTreeClassifier`` is fitted; otherwise the
        columns are string-indexed and MLlib's ``DecisionTree.trainClassifier``
        is used. The textual tree is then converted by the helper methods of
        this class, a progress message is saved, and the result is returned.

        Args:
            measure_columns: Accepted for interface compatibility; this method
                always uses ``self._measure_columns``.
            dimension_columns: Sequence whose first element names the target
                dimension. Despite the ``None`` default it must be provided,
                because it is indexed unconditionally.

        Returns:
            The populated ``DecisionTreeResult``.
        """
        self._target_dimension = dimension_columns[0]
        dimension = self._target_dimension

        #####Look into it for Issue 947#################
        max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
        # max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
        # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
        all_dimensions = [
            dim for dim in self._dimension_columns
            if self._metaParser.get_num_unique_values(dim) <= max_num_levels
        ]
        all_measures = self._measure_columns
        if self._pandas_flag:
            self._data_frame = self._data_frame[all_dimensions + all_measures]
        cat_feature_info = []
        columns_without_dimension = [
            x for x in all_dimensions if x != dimension
        ]
        decision_tree_result = DecisionTreeResult()
        decision_tree_result.set_freq_distribution(
            self._metaParser.get_unique_level_dict(self._target_dimension),
            self._important_vars)
        if self._pandas_flag:
            # The target may already have been filtered out above; list.remove
            # raises ValueError in that case, which is the only error ignored.
            try:
                all_dimensions.remove(dimension)
            except ValueError:
                pass
            actual_cols = list(self._data_frame.columns)
            print(actual_cols)
            # One-hot encode every non-target dimension column.
            self._data_frame = pd.get_dummies(self._data_frame,
                                              columns=all_dimensions)
            after_dummy_cols = list(self._data_frame.columns)

            def Diff(li1, li2):
                # Symmetric difference of the two lists (order not guaranteed).
                return (list(
                    list(set(li1) - set(li2)) + list(set(li2) - set(li1))))

            decision_tree_result.dummy_cols = [
                Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
                all_dimensions
            ]

        all_dimensions.append(dimension)  #this has been done for scoring error
        if self._pandas_flag:
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, [dimension], self._pandas_flag)
        else:
            self._data_frame, mapping_dict = MLUtils.add_string_index(
                self._data_frame, all_dimensions, self._pandas_flag)
        if self._pandas_flag:
            print(self._data_frame.head(1))
        else:
            # DataFrame.show() prints by itself and returns None, so it is not
            # wrapped in print() (which would emit a stray "None" line).
            self._data_frame.show(1)
        # standard_measure_index = {0.0:'Low',1.0:'Medium',2.0:'High'}
        standard_measure_index = {
            0.0: 'Low',
            1.0: 'Below Average',
            2.0: 'Average',
            3.0: 'Above Average',
            4.0: 'High'
        }
        for measure in all_measures:
            mapping_dict[measure] = standard_measure_index

        # Strip commas from every level name, remembering the original
        # spelling in self._alias_dict so it can be looked up later.
        for k, v in list(mapping_dict.items()):
            temp = {}
            for k1, v1 in list(v.items()):
                stripped = v1.replace(",", "")
                self._alias_dict[stripped] = v1
                temp[k1] = stripped
            mapping_dict[k] = temp
        self._mapping_dict = mapping_dict
        if not self._pandas_flag:
            # Spark path: record the arity of every categorical feature.
            for c in columns_without_dimension:
                cat_feature_info.append(
                    self._data_frame.select(c).distinct().count())
            # Measures are described with the five levels listed in
            # standard_measure_index.
            for c in all_measures:
                cat_feature_info.append(5)
            columns_without_dimension = columns_without_dimension + all_measures
            all_measures = []
            if len(cat_feature_info) > 0:
                max_length = max(cat_feature_info)
            else:
                max_length = 32
        else:
            decision_tree_result.mappingdict = mapping_dict[dimension]
            max_length = 32
        # MLlib expects {feature index: number of categories}.
        cat_feature_info = dict(enumerate(cat_feature_info))
        if self._pandas_flag:
            dimension_classes = len(self._data_frame[dimension].unique())
        else:
            dimension_classes = self._data_frame.select(
                dimension).distinct().count()
        if not self._pandas_flag:
            # Put the target first so that row[0] is the label below.
            self._data_frame = self._data_frame[[dimension] +
                                                columns_without_dimension +
                                                all_measures]
        print("=" * 200)
        # print self._data_frame.rdd.first()
        print("numClasses", dimension_classes)
        print("maxDepth", self._maxDepth)
        decision_tree_result._maxDepth = self._maxDepth
        print("maxBins", max_length)
        print("=" * 200)
        if self._pandas_flag:
            # Raw string: '\W' in a normal literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            self._data_frame.columns = [
                re.sub(r'\W+', '_', col.strip())
                for col in self._data_frame.columns
            ]
            x = self._data_frame.drop(dimension, axis=1)
            y = self._data_frame[dimension]
            #tle = LabelEncoder()
            #y = tle.fit_transform(y)
            # Fill missing values with each column's most frequent value.
            for i in x.columns:
                x[i] = x[i].fillna(x[i].mode()[0])
            model = DecisionTreeClassifier(criterion='gini',
                                           max_depth=self._maxDepth,
                                           random_state=42)
            model = model.fit(x, y)
            output_result = self.tree_to_code(model, list(x.columns))
            output_result = [rule.strip() for rule in output_result]
        else:
            data = self._data_frame.rdd.map(
                lambda x: LabeledPoint(x[0], x[1:]))
            # Weights [1.0, 0.0]: every row is meant for the training split.
            trainingData, _ = data.randomSplit([1.0, 0.0])
            # TO DO : set maxBins at least equal to the max level of categories in dimension column
            # model = DecisionTree.trainClassifier(trainingData, numClasses=dimension_classes, categoricalFeaturesInfo=cat_feature_info, impurity='gini', maxDepth=self._maxDepth, maxBins=max_length)
            # Removed categoricalFeaturesInfo to be passed to DecisionTree to get all levels and consider all feature as continuous variables
            #But that results in wrong result in Prediction Rule eg: columns containing "yes" or "no" as its value is considered as float value(0.5) so removing categoricalFeaturesInfo={} with categoricalFeaturesInfo=cat_feature_info
            model = DecisionTree.trainClassifier(
                trainingData,
                numClasses=dimension_classes,
                categoricalFeaturesInfo=cat_feature_info,
                impurity='gini',
                maxDepth=self._maxDepth,
                maxBins=max_length)
            output_result = model.toDebugString()
        decision_tree = self.tree_json(output_result, self._data_frame,
                                       self._pandas_flag)
        self._new_tree = self.generate_new_tree(decision_tree)
        node_list = self.node_name_extractor(self._new_tree)
        node_list = list(self.flatten(node_list))
        correct_count_list = [i[0] for i in self._count_list]
        tree_dict = dict(list(zip(node_list, correct_count_list)))
        self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
        self._path_dict = self.path_dict_creator(node_list, self._new_tree)
        print("===" * 40)
        decision_tree_result.set_params(self._new_tree, self._new_rules,
                                        self._total, self._success,
                                        self._probability, self._path_dict)
        # Advance the completion status by this stage's weighted share and
        # publish a progress message for the "treegeneration" stage.
        self._completionStatus += old_div(
            self._scriptWeightDict[self._analysisName]["script"] *
            self._scriptStages["treegeneration"]["weight"], 10)
        progressMessage = CommonUtils.create_progress_message_object(
            self._analysisName,
            "treegeneration",
            "info",
            self._scriptStages["treegeneration"]["summary"],
            self._completionStatus,
            self._completionStatus)
        CommonUtils.save_progress_message(self._messageURL,
                                          progressMessage,
                                          ignore=self._ignoreMsg)
        self._dataframe_context.update_completion_status(
            self._completionStatus)

        return decision_tree_result
Exemplo n.º 47
0
        # Debug output: inspect the first labeled test point.
        print(test_data[0].features)
        print(dir(test_data[0]))
        # Prediction only needs the feature vectors, so drop the labels.
        new_test_data = [l_point.features for l_point in test_data]
        test = sc.parallelize(new_test_data)

        # Wall-clock timer. time.process_time() only counts CPU time used by
        # this Python process and excludes time spent waiting, so it does not
        # reflect how long the Spark job itself takes.
        start_time = time.perf_counter()
        print("Creating tree")
        #dt = SparkDecisionTreeClassifier(featuresCol = 'features',
        #                                 labelCol = 'Target',
        #                                 maxMemoryInMB=2048,
        #                                 minInstancesPerNode=2)
        print("Fitting")
        # Empty categoricalFeaturesInfo: every feature is treated as continuous.
        dtModel = DecisionTree.trainClassifier(train,
                                               numClasses=2,
                                               categoricalFeaturesInfo={},
                                               minInstancesPerNode=2)

        #dtModel = dt.fit(train)
        print("Predicting")
        predictions = dtModel.predict(test).collect()
        #predictions = dtModel.transform(test)
        print("Showing Predictions")
        #predictions.printSchema()
        #predictions.select('rawPrediction', 'prediction', 'probability').show(10)
        print(predictions)
        end_time = time.perf_counter()

        delta_time = end_time - start_time

        # Elapsed training + prediction time, in minutes.
        print(delta_time / 60.0)
Exemplo n.º 48
0
    sc = SparkContext(appName="PythonDT")

    # Pick the input file: the single command-line argument when exactly one
    # is given, otherwise the bundled sample data set.
    dataPath = sys.argv[1] if len(sys.argv) == 2 else 'data/mllib/sample_libsvm_data.txt'
    if not os.path.isfile(dataPath):
        # Release the Spark context before reporting the usage error.
        sc.stop()
        usage()
    points = MLUtils.loadLibSVMFile(sc, dataPath)

    # Class labels are re-indexed if needed; the mapping size gives the
    # number of classes.
    reindexedData, origToNewLabels = reindexClassLabels(points)
    numClasses = len(origToNewLabels)

    # Fit the classifier. No feature is declared categorical.
    categoricalFeaturesInfo = {}
    model = DecisionTree.trainClassifier(
        reindexedData,
        numClasses=numClasses,
        categoricalFeaturesInfo=categoricalFeaturesInfo)

    # Report the learned tree and its statistics.
    print("Trained DecisionTree for classification:")
    print("  Model numNodes: %d" % model.numNodes())
    print("  Model depth: %d" % model.depth())
    print("  Training accuracy: %g" % getAccuracy(model, reindexedData))
    # Only trees with fewer than 20 nodes are printed in full.
    description = model.toDebugString() if model.numNodes() < 20 else model
    print(description)

    sc.stop()
#	elif line_split[41]=='rootkit.':
#		attack = 19.0
#	elif line_split[41]=='perl.':
#		attack = 20.0
#	elif line_split[41]=='loadmodule.':
#		attack = 21.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))
# Convert the raw CSV rows of both data sets into LabeledPoints.
training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)
t0 = time()
# The placeholder was missing from the format string, so the start time was
# silently dropped from the message.
print("Classifier training started at: {}".format(round(t0, 3)))
# Features 1, 2 and 3 are categorical; their arity comes from the sizes of
# the protocols, services and flags collections.
tree_model = DecisionTree.trainClassifier(training_data,
                                          numClasses=5,
                                          categoricalFeaturesInfo={
                                              1: len(protocols),
                                              2: len(services),
                                              3: len(flags)
                                          },
                                          impurity='gini',
                                          maxDepth=4,
                                          maxBins=100)
tree_model.save(sc, "/home/ubuntu/project_src/probe_portsweep_model")
# Note: the measured time includes saving the model, not only training.
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
# Predict on the test features and pair each true label with its prediction.
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
# Time the accuracy computation separately (t0 and tt are reused).
t0 = time()
test_accuracy = labels_and_preds.filter(
    lambda vp: vp[0] == vp[1]).count() / float(test_data.count())
tt = time() - t0
#print(labels_and_preds.collect())
print(
Exemplo n.º 50
0
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold

# Kfold
if __name__ == "__main__":
	sc = SparkContext('local',appName="Prediction")
	import fileinput
	data_y1, data_y2 = [], []
	for line in fileinput.input("data/feature_extracted_class3.txt"):
		data_y1.append(LabeledPoint(float(1 if int(line.split("\t")[2])!=0 else 0), [float(i) for i in line.split("\t")[3:]]))
		data_y2.append(LabeledPoint(int(line.split("\t")[2]), [float(i) for i in line.split("\t")[3:]]))
	total, right, mse = 0, 0, []
	for t in xrange(10):
		kf = KFold(32*40, n_folds=10)
		for train, test in kf:
			data_train_y1, data_train_y2 = [], []
			for i in train:
				data_train_y1.append(data_y1[i])
				data_train_y2.append(data_y2[i])
			clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100)
			clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2), categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100)
			for i in test:
				data_test_y1, data_test_y2 = data_y1[i], data_y2[i]
				r1 = clf1.predict(data_test_y1.features)
				r2 = clf2.predict(data_test_y2.features)
				if r1 == data_test_y1.label:
					right += 1
				mse.append(abs(r2-data_test_y2.label))
				total += 1
	print float(right)/total, sum(mse)/len(mse)
Exemplo n.º 51
0
# Run Spark in local mode with two threads; the author notes that at least
# two are needed (presumably one is taken by the streaming receiver -- TODO confirm).
conf = SparkConf().setAppName(appName).setMaster("local[2]") #at least 2
sc = SparkContext(conf=conf)
# Streaming context on top of the same SparkContext, with a batch duration of 1
# (seconds, per the StreamingContext API -- TODO confirm).
ssc = StreamingContext(sc, 1)

# separate the classification label and the actual data
def parsePoint(line):
  """Convert one comma-separated record into a LabeledPoint.

  Every field is parsed as a float. The first field is the classification
  label and the remaining fields form the feature vector.
  """
  numbers = list(map(float, line.split(',')))
  label = numbers[0]
  features = numbers[1:]
  return LabeledPoint(label, features)

# training the model
data = sc.textFile(learning_data_file)
parsedData = data.map(parsePoint)

model = (DecisionTree.trainClassifier(parsedData,
                                      numClasses=2,
                                      categoricalFeaturesInfo={2:9},
                                      impurity='gini',
                                      maxDepth=30))
"""
model = (RandomForest.trainClassifier(parsedData,
                                      numClassesForClassification=2,
                                      numTrees=6,
                                      categoricalFeaturesInfo={2:10},
                                      impurity='gini',
                                      maxDepth=30))
"""
print "====================== model trained ======================"

# streaming and parsing text
lines = ssc.socketTextStream(HOST, QUERY_PORT)
vectors = lines.flatMap(lambda x:x.split(',')).map(lambda l:float(l))
Exemplo n.º 52
0
    """
    values = [float(s) for s in line.strip().split(',')]
    if values[0] == -1: # Convert -1 labels to 0 for MLlib
        values[0] = 0
    elif values[0] > 0:
        values[0] = 1
    return LabeledPoint(values[0], values[1:])

parsed_data = points.map(parsePoint)

# Parenthesised single-argument print works identically on Python 2 and 3.
print('After parsing, number of training lines: %s' % parsed_data.count())

parsed_data.take(5)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = parsed_data.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
# NOTE(review): numClasses is 5 although parsePoint appears to map labels to
# 0/1 -- confirm the intended number of classes.
model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=3, maxBins=32)

# Evaluate the model on the held-out test set.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair instead of unpacking it in the lambda
# signature: tuple parameters are a syntax error on Python 3 (PEP 3113).
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save the model under the path "decisiontree".
model.save(sc, "decisiontree")

Exemplo n.º 53
0
    def test_classification(self):
        """Train each MLlib classifier on a tiny data set and check it.

        Every model must predict a value <= 0 for the two points labeled 0.0
        and a value > 0 for the two points labeled 1.0. The tree-based models
        are additionally saved, reloaded, and compared through their debug
        strings. The temporary model directory is removed even when an
        assertion fails.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        def check_predictions(model):
            # Points 0 and 2 are labeled 0.0; points 1 and 3 are labeled 1.0.
            self.assertTrue(model.predict(features[0]) <= 0)
            self.assertTrue(model.predict(features[1]) > 0)
            self.assertTrue(model.predict(features[2]) <= 0)
            self.assertTrue(model.predict(features[3]) > 0)

        def check_save_load(model, model_class, model_dir):
            # A reloaded model must describe the same tree(s) as the original.
            model.save(self.sc, model_dir)
            same_model = model_class.load(self.sc, model_dir)
            self.assertEqual(same_model.toDebugString(), model.toDebugString())

        temp_dir = tempfile.mkdtemp()
        try:
            lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
            check_predictions(lr_model)

            svm_model = SVMWithSGD.train(rdd, iterations=10)
            check_predictions(svm_model)

            nb_model = NaiveBayes.train(rdd)
            check_predictions(nb_model)

            categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
            dt_model = DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
            check_predictions(dt_model)
            check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

            rf_model = RandomForest.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
                maxBins=4, seed=1)
            check_predictions(rf_model)
            check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

            gbt_model = GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
            check_predictions(gbt_model)
            check_save_load(gbt_model, GradientBoostedTreesModel,
                            os.path.join(temp_dir, "gbt"))
        finally:
            # Best-effort cleanup that also runs when an assertion above
            # fails, so failed runs no longer leave the directory behind.
            try:
                rmtree(temp_dir)
            except OSError:
                pass
Exemplo n.º 54
0
    # Load the source collection as a data frame.
    data = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

    # 70% of the rows for training, 30% for testing.
    trainingData, testData = data.randomSplit([0.7, 0.3])

    # Convert the rows of each split into LabeledPoints.
    trainingDataRDD = trainingData.rdd.map(rdd2label_point)
    testDataRDD = testData.rdd.map(rdd2label_point)

    # Categorical features: {feature index: number of categories}.
    category = {1: 2, 4: 2, 6: 2, 7: 4, 10: 2, 11: 2, 12: 2}

    # Settings shared by both tree-based models.
    tree_params = dict(numClasses=2, categoricalFeaturesInfo=category,
                       impurity='gini', maxDepth=5, maxBins=12)

    # Single decision tree.
    decision_tree_model = DecisionTree.trainClassifier(trainingDataRDD, **tree_params)

    # Random forest of 7 trees.
    random_forest_model = RandomForest.trainClassifier(
        trainingDataRDD, numTrees=7, featureSubsetStrategy="auto", **tree_params)

    # Feature vectors of the test split, and the size of that split.
    testDataFeatureRDD = testDataRDD.map(lambda point: point.features)
    testDataCount = testData.count()

    # Collect the predictions of both models on the driver.
    decision_tree_prediction = decision_tree_model.predict(testDataFeatureRDD).collect()
    random_forest_prediction = random_forest_model.predict(testDataFeatureRDD).collect()

    # metric
Exemplo n.º 55
0
        vector)  #se precisar de feature do Feature Selection

    data = pass2libsvm(reduced, sc.parallelize(classes))

    #para a (5-tupla deveria ser algo como ) data=pass2libsvm(vector)

    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    print 'data devided'

    #trainingData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-16.out',15))

    #testData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-25.out',15))

    # Train a DecisionTree model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numberClasses, {})
    #, maxDepth=5, maxBins=32)

    # let lrm be a LogisticRegression Model

    #model.save(sc, "hdfs://master:9000/user/app/model-"+str(sys.argv[2]+".model"))
    print 'model done'
    #to load the model
    #sameModel = DecisionTreeModel.load(sc, "lrm_model.model")

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))

    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

    metrics = MulticlassMetrics(labelsAndPredictions)
Exemplo n.º 56
0
# Load the data set.
sc = SparkContext(appName="trees3")
# Raw string: "\c" is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on recent Python versions.
text = sc.textFile(r"home\cbank.data")

# 0-based positions of the columns whose values are shifted down by one:
# nine features plus the class label in the last column (index 16).
ONE_BASED_COLUMNS = frozenset([1, 2, 3, 4, 6, 7, 8, 10, 15, 16])


def _to_zero_based(row):
    """Return *row* with the columns in ONE_BASED_COLUMNS decremented by one.

    Replaces a 17-argument tuple-parameter lambda, which is a syntax error on
    Python 3 (PEP 3113). A row that does not have exactly 17 values still
    raises ValueError, as the tuple unpacking did.
    """
    if len(row) != 17:
        raise ValueError("expected 17 values per row, got %d" % len(row))
    return [value - 1 if index in ONE_BASED_COLUMNS else value
            for index, value in enumerate(row)]


data = (text.map(lambda l : l.split('\t'))
            .map(lambda v : [ int(x.replace("A", "")) for x in v ])
            .map(_to_zero_based)
            .map(lambda v : LabeledPoint(v[-1], v[:-1])))


# Split the data into a training set and a test set (70% / 30%).
(trainData, testData) = data.randomSplit([0.7, 0.3])

# Train the decision-tree model.
model = DecisionTree.trainClassifier(
            trainData, numClasses=2, categoricalFeaturesInfo={1:12, 2:3, 3:4, 4:2, 6:2, 7:2, 8:3, 10:12, 15:4},
            impurity='entropy', maxDepth=3)


# Evaluate the model to get the percentage of correct predictions.
predictions = model.predict(testData.map(lambda lp : lp.features))

# Pairs of (true label, prediction).
results = testData.map(lambda lp : lp.label).zip(predictions)

acc = (results.filter(lambda vp: vp[0] == vp[1])
              .count()) / float(testData.count())
print('% Aciertos: ' + str(acc * 100))

# Compute further metrics: true positives and true negatives.
tp = results.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count()
tn = results.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count()
Exemplo n.º 57
0
# Split every CSV line into a list of string fields. The discarded
# .collect() calls of the original were dropped throughout this block: their
# results were never used and they only pulled each full RDD onto the driver.
to_csv = rdd.map(lambda x: x.split(","))

# Split into train and test sets (80% / 20%).
train,test = to_csv.randomSplit([0.8,0.2],seed=42)

# Columns 0-7 are the features and column 8 is the label.
training_data = train.map(
    lambda x: LabeledPoint(float(x[8]), [float(v) for v in x[:8]]))

# Extract the test labels.
test_labels = test.map(lambda x: float(x[8]))

model = DecisionTree.trainClassifier(training_data, numClasses=2,categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=6, maxBins=40)

# Predict on the eight feature columns only. The original passed the raw test
# rows, which still contained the label as a ninth column, to a model trained
# on eight features.
test_features = test.map(lambda x: [float(v) for v in x[:8]])
test_results = model.predict(test_features)
#print('Diabetic Predictions:')
#results = test_results.collect()
#for result in results:
#    print(result)

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())

# Zip the true test labels with the predictions made on the test features.
labelsAndPredictions = test_labels.zip(test_results)
Exemplo n.º 58
0
# In[8]:

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
import utils_mesure
data = sc.textFile("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF_svm+".csv")

# Remove the header line.
nomColInit = data.first()
data2 = data.filter(lambda line: nomColInit != line)
data = data2.map(utils_mesure.parseLine)

# Sampling: 60% training and 40% test.
training, test = data.randomSplit([0.6, 0.4], seed=0)
# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=7, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=10, maxBins=32)
# Test.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair instead of unpacking it in the lambda
# signature: tuple parameters are a syntax error on Python 3 (PEP 3113).
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())


# In[8]:

# Global measures of the model.
utils_mesure.tabSum(labelsAndPredictions, 7, 'Decision Tree')

Exemplo n.º 59
0
        klass,
        (1 if age == 'adults' else 0),
        (1 if sex == 'women' else 0)
    ]
    return LabeledPoint(1 if survived == 'yes' else 0, features)

# Turn every row into a LabeledPoint and peek at a fixed-seed sample of five.
labeled_points_rdd = data_rdd.map(row_to_labeled_point)
labeled_points_rdd.takeSample(False, 5, 0)

# Reproducible 70% / 30% train/test split.
split_weights = [0.7, 0.3]
training_rdd, test_rdd = labeled_points_rdd.randomSplit(split_weights, seed = 0)

training_count = training_rdd.count()
test_count = test_rdd.count()
training_count, test_count

# All three features are categorical: feature 0 has 3 categories, features 1
# and 2 have 2 each.
model = DecisionTree.trainClassifier(
    training_rdd,
    numClasses=2,
    categoricalFeaturesInfo={0: 3,1: 2,2: 2})

# Score the decision tree on the held-out points.
test_features_rdd = test_rdd.map(lambda x: x.features)
predictions_rdd = model.predict(test_features_rdd)

truth_and_predictions_rdd = test_rdd.map(lambda lp: lp.label).zip(predictions_rdd)

# Accuracy: share of test points whose prediction equals the true label.
correct_count = truth_and_predictions_rdd.filter(lambda v_p: v_p[0] == v_p[1]).count()
accuracy = correct_count / float(test_count)
print('Accuracy =', accuracy)
print(model.toDebugString())

# Train a logistic regression on the same split; `model` now refers to it.
model = LogisticRegressionWithSGD.train(training_rdd)

predictions_rdd = model.predict(test_rdd.map(lambda x: x.features))

labels_and_predictions_rdd = test_rdd.map(lambda lp: lp.label).zip(predictions_rdd)
Exemplo n.º 60
0
# In[ ]:

# In[53]:

# Keep 80% of the data for training and hold out 20% for testing.
trainingData, testData = fdata.randomSplit([0.8, 0.2])

# Use the decision tree classifier to train the model

# In[54]:

from pyspark.mllib.tree import DecisionTree

# In[55]:

# Three classes; the empty categoricalFeaturesInfo declares no categorical
# features.
model = DecisionTree.trainClassifier(
    trainingData, numClasses=3, categoricalFeaturesInfo={})

# In[56]:

test_features = testData.map(lambda row: row.features)
predictions = model.predict(test_features)

# Create Confusion Matrix to evaluate the accuracy of the model

# We create a matrix containing the test labels as a first column (real values) and predicted values as second column

# In[57]:

# Despite the variable's name, each pair is ordered (label, prediction).
true_labels = testData.map(lambda labeledpoint: labeledpoint.label)
predictionsAndLabels = true_labels.zip(predictions)