def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def evaluate_model(model_type):
    # 'model_type' avoids shadowing the built-in 'type'
    if model_type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif model_type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif model_type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
    return model
def predict_proba(rf_model, testRDD):
    # Collect the individual decision trees from the underlying Java model
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Start every row with a zeroed vote count for each of the 10 classes
    scores_dict = {i: 0 for i in range(0, 10)}
    scoresRDD = testRDD.map(lambda x: scores_dict.copy())

    def reduceTuple(x):
        # x is (vote_counts_dict, predicted_class); fold the vote into the counts
        x[0][int(x[1])] += 1
        return x[0]

    for tree in trees:
        dtm = DecisionTreeModel(tree)
        currentScoreRDD = dtm.predict(testRDD)
        # Pair each row's running vote counts with this tree's prediction
        scoresRDD = scoresRDD.zip(currentScoreRDD)
        scoresRDD = scoresRDD.map(reduceTuple)

    return scoresRDD
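A hedged usage sketch for the vote-counting wrapper above; `rf_model` (a trained RandomForestModel) and `testRDD` (an RDD of feature vectors) are assumed to already exist:

# Hedged usage sketch: 'rf_model' and 'testRDD' are assumed to exist already.
votesRDD = predict_proba(rf_model, testRDD)
ntrees = float(rf_model.numTrees())

# Normalize raw vote counts into per-class probabilities
probsRDD = votesRDD.map(lambda votes: {cls: count / ntrees for cls, count in votes.items()})
print(probsRDD.take(3))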
def saveModel(self):
    # save the model to the given path
    self.tree_model.save(self.sc, "trained")
    # re-load the saved model
    self.tree_model = DecisionTreeModel.load(self.sc, "trained")
    # re-evaluate
    self.evaluate()
def main(sc, filename):
    '''
    The driver for the Spark scoring application; it generates predictions
    for a given file of features and target variables.
    '''
    rawDataRdd = sc.textFile(filename)
    print "Data Size: {}".format(rawDataRdd.count())

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    # load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # logistic regression predictions
    labels_and_preds = labeledPointsRdd.map(lambda p: (float(logit_model.predict(p.features)), p.label))
    labels_and_preds_collected = labels_and_preds.collect()
    print "\n"
    print "Predictions: Logistic Regression"
    y_true = []
    y_pred = []
    for row in labels_and_preds_collected:
        y_true.append(row[1])
        y_pred.append(row[0])
        # print "predicted: {0} - actual: {1}\n".format(row[0], row[1])

    accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Logistic): {}".format(round(accuracy, 4))
    print_box()
    print "\n"

    # decision tree predictions
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    # collect the decision-tree pairs (not the logistic ones, as before)
    labels_and_preds_dt_collected = labels_and_preds_dt.collect()
    accuracy_dt = labels_and_preds_dt.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4))
    print_box()
    print "\n"

    # random forest predictions
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4))
    print_box()
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        model_name = "dt"
        updated_model = "dt0"
        model_path, data_path, metadata_path = '', '', ''

        # loop to check the availability of a newer model classifier
        for i in range(25, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if patherror(data_path) == False and patherror(metadata_path) == False:
                break

        # load the model classifier
        model = DecisionTreeModel.load(sc, model_path)

        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)

        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)

        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
def get_probs_classify(model, data):
    # Collect the individual decision trees as JavaArray objects
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each remaining tree, apply its prediction to the entire dataset
    # and zip together the results
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
def get_dt_model(sc, train=None):
    model_path = 'dt.model'
    if train is None:
        model = DecisionTreeModel.load(sc, model_path)
    else:
        model = DecisionTree.trainClassifier(train, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=10)
        model.save(sc, model_path)
    return model
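A brief usage sketch for the helper above, assuming `trainingData` is an RDD of LabeledPoints:

# Hedged usage sketch: 'trainingData' is an assumed RDD of LabeledPoint rows.
# First run: train and persist the model under 'dt.model'.
dt_model = get_dt_model(sc, train=trainingData)

# Later runs: reload the persisted model instead of retraining.
dt_model = get_dt_model(sc)
print(dt_model.toDebugString())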
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results with 'union', keyed by feature vector.
    featsAndPredictions = sc.parallelize([])  # empty RDD
    for i in range(ntrees):
        dtm = DecisionTreeModel(trees[i])
        predictions = dtm.predict(data.map(lambda x: x.features))
        featsAndPredictions = featsAndPredictions.union(
            data.map(lambda lp: lp.features).zip(predictions))
        # scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        # scores = scores.map(lambda x: x[0] + x[1])

    # add up the predictions and divide the accumulated scores by the number of trees
    return featsAndPredictions.reduceByKey(lambda a, b: a + b).map(lambda (key, val): (key, val / ntrees))
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print dtmodel.toDebugString()
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.sparkContext.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.sparkContext.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
def predict_proba(model, data):
    '''
    Input: a PySpark RandomForestModel object and an RDD of LabeledPoints.
    Output: list of probabilities.

    This wrapper exposes the probabilities (i.e. confidences) behind
    a given prediction.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in xrange(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities
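A hedged usage sketch for this wrapper, assuming a trained binary RandomForestModel `rf_model` and an RDD `testData` of LabeledPoints; the collected scores can feed BinaryClassificationMetrics for an AUC estimate:

# Hedged usage sketch: 'rf_model' and 'testData' are assumed to exist already.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

probs = predict_proba(rf_model, testData)
labels = testData.map(lambda lp: lp.label).collect()

# Pair each score with its true label and measure ranking quality (AUC)
score_and_labels = sc.parallelize(list(zip(probs, labels)))
metrics = BinaryClassificationMetrics(score_and_labels)
print(metrics.areaUnderROC)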
def init_spark_context():
    global predictionModel
    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])

    # absolute path in hdfs
    # to run locally, remove the first slash '/', i.e. my_model1, not /my_model1
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')

    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')

    return sc
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        # data = pass2libsvm(vector)
        data = pass2libsvm(reduced, classes)
        # data = pass2libsvm(vector, classes)  # for the 5-tuple it should be something like data = pass2libsvm(vector)
        # (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''
    def to_features(row):
        # project the raw row onto the numeric feature vector used in training
        return [float(row.SearchID), float(row.AdID), float(row.Position),
                float(row.ObjectType), float(row.HistCTR)]

    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(to_features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(to_features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
.setAppName("Mlib") .set("spark.executor.memory", "1g")) sc = SparkContext(conf = conf) dv1 =np.array([1.0,0.0,3.0]) dv2= [1.0,0.0,3.0] sv1 = Vectors.sparse(3,[0,2],[1.0,3.0]) sv2 = sps.csc_matrix((np.array([1.0,3.0]),np.array([0,2]),np.array([0,2])),shape=(3,1)) print sv2 data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt') (trainingData, testData) = data.randomSplit([0.7, 0.3]) model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "model_data") sameModel = DecisionTreeModel.load(sc, "model_data")
conf = SparkConf()
conf.setAppName("TA")
sc = SparkContext(conf=conf)
tre = StreamingContext(sc, 10)

htf = HashingTF(50000)

NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes'
NB_model = NaiveBayesModel.load(sc, NB_directory)

LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression'
LR_model = LogisticRegressionModel.load(sc, LR_directory)

DT_output_dir = 'hdfs://master:9000/user/hadoop/DT'
DT_model = DecisionTreeModel.load(sc, DT_output_dir)

voted_classifier = VoteClassifier(NB_model, LR_model, DT_model)

def sentiment(test_sample):
    sample_data_test = test_sample.split(" ")
    cli = htf.transform(sample_data_test)
    return voted_classifier.classify(cli)

lines = tre.socketTextStream(socket.gethostbyname(socket.gethostname()), 10000)
lines.pprint()
tweets = lines.flatMap(lambda text: [(text)])
tweets.pprint()
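`VoteClassifier` is defined elsewhere in that project; a minimal majority-vote sketch consistent with how it is called above might look like this (the implementation details are an assumption, not the project's actual code):

# Hypothetical sketch of the VoteClassifier used above; the real project's
# implementation may differ. It wraps any number of trained mllib models
# and classifies a feature vector by majority vote.
class VoteClassifier(object):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        # collect one vote per model and return the most common label
        votes = [int(c.predict(features)) for c in self._classifiers]
        return max(set(votes), key=votes.count)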
    # Cancelled becomes the 9th column now, and total columns in the data = 9
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")
sc.stop()
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# stop any context left over from a previous run
try:
    sc.stop()
except:
    pass

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
    sc.setLogLevel("ERROR")
    model1 = DecisionTreeModel.load(sc, "runs")
    model2 = DecisionTreeModel.load(sc, "wickets")

    batsmen_cluster = {}
    bowler_cluster = {}
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_batsmen.csv') as f:
        for line in f:
            ar = line.split(',')
            a = []
            a.append(int(ar[0]))
            a.append(float(ar[3]))
            a.append(float(ar[4]))
            batsmen_cluster[ar[2]] = a
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_bowler.csv') as f:
import json
import requests
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from numpy import array

app = Flask(__name__)

conf = SparkConf()
conf.setAppName("Classification")
try:
    sc.stop()
except:
    pass
sc = SparkContext(pyFiles=['/home/ubuntu/project_src/flaskapp/createLabeledPoint.py',
                           '/home/ubuntu/project_src/flaskapp/ClassSet.py',
                           '/home/ubuntu/project_src/flaskapp/FuncSet.py',
                           '/home/ubuntu/project_src/flaskapp/hello.py']).getOrCreate(conf=conf)
# testm = DecisionTreeModel.load(sc, "hdfs://*****:*****")

@app.route('/')
def hello_world():
    return 'From python hello!'

@app.route('/index')
def index():
    return render_template("index.html")

@app.route('/train')
def trainodule():
    pass

@app.route('/getSpkTstCnt')
def runclass():
    # testm = DecisionTreeModel.load(sc, "hdfs://ip-172-31-1-239:9000/home/ubuntu/project_src/tree_model")
    test_data_file = "hdfs://ip-172-31-1-239:9000/user/ubuntu/corrected.gz"
    test_raw_data = sc.textFile(test_data_file)
sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
# $example off$
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
def getModel(self, path):
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
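A minimal usage sketch, assuming a hypothetical wrapper object that carries the `type` and `sc` attributes this method reads; the class and path below are illustrative only:

# Hypothetical stand-in for the class that owns getModel above.
class ModelLoader(object):
    def __init__(self, sc, type):
        self.sc = sc
        self.type = type
    getModel = getModel  # reuse the method defined above

loader = ModelLoader(sc, 'DecisionTree')
dt_model = loader.getModel('hdfs:///models/dt')  # hypothetical path
print(dt_model.numNodes())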
    # Cancelled becomes the 6th column now, and total columns in the data = 6
    label = clean_line_split[5]
    nonLable = clean_line_split[0:5]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")
sc.stop()
import csv
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Remove the header line from the csv file
def clean(x):
    if (x[29] != "Amount"):
        return x

# Turn a row into a labeled point (label in column 30, two selected features)
def normalize(x):
    return LabeledPoint(float(x[30]), [float(x[0]), float(x[29]) / 25691.16])

sameModel = DecisionTreeModel.load(sc, "./decisiontreefraud")

# make a spark conf
conf = (SparkConf().setMaster("local").setAppName("My app").set("spark.executor.memory", "4g"))

# files have to be added while running to see the data in the stream
ssc = StreamingContext(sc, 1)

lines1 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/b")
trainingData = lines1.map(lambda line: LabeledPoint(float(line.split(" ")[1]),
                                                    [(line.split(" ")[0]), (line.split(" ")[2])])).cache()
trainingData.pprint()

lines2 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/c")
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.

        # Load CSV data
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create a vector assembler to produce a feature vector for each record for use in MLlib.
        # The first 45 csv fields are features, the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble feature vector in a new dataframe
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        # Train model
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)

        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        # model.save(sc, path + 'model-' + file)
        return model, index
def selectNext(self):
    # predict for the rest of the datapoints
    self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
        .leftOuterJoin(self.dataset.trainSet) \
        .map(lambda _: (_[0], _[1][1]))

    actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    myDebugger.TIMESTAMP('zipping indices')

    rdd = sc.parallelize([])

    ''' these java objects are not serializable, thus there is still no
    support for making an RDD out of them! '''
    for x in self.model._java_model.trees():
        # zip each decision tree's prediction with the individual sample
        # index so that they can be added up later
        predX = DecisionTreeModel(x) \
            .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
        rdd = rdd.union(predX)
    myDebugger.TIMESTAMP('get individual tree predictions')

    ''' adding up the number of 1s in each sample's predictions;
    this is the vote count for class 1 '''
    classPrediction = rdd.groupByKey().mapValues(sum)
    myDebugger.TIMESTAMP('reducing')

    # referencing self.nEstimators directly inside the closure gives an error
    totalEstimators = self.nEstimators

    # predicted probability of class 0
    classPrediction = classPrediction.map(
        lambda _: (_[0], abs(0.5 - (1 - (_[1] / totalEstimators)))))
    myDebugger.TIMESTAMP('mapping')

    # select the index with the highest uncertainty (closest to probability 0.5)
    selectedIndex1toN = classPrediction.sortBy(lambda _: _[1]).first()[0]
    myDebugger.TIMESTAMP('sorting')

    # take the selected index from the unknown samples and add it to the known ones
    self.indicesKnown = self.indicesKnown.union(sc.parallelize([selectedIndex1toN]))
    myDebugger.TIMESTAMP('update known indices')

    # remove the selected sample from the unlabeled ones (update)
    self.indicesUnknown = self.indicesUnknown.filter(lambda _: _ != selectedIndex1toN)
    myDebugger.TIMESTAMP('update unknown indices')

    myDebugger.DEBUG(selectedIndex1toN)
    myDebugger.DEBUG(self.indicesKnown.collect())
    myDebugger.DEBUG(self.indicesUnknown.collect())
    myDebugger.TIMESTAMP('DEBUGGING DONE')
parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print('Time consumed = '), (datetime.now() - startTime)
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# save and load model
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
sc.stop()
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
data = MLUtils.loadLibSVMFile(sc, dataPath)

# Split the dataset into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train a decision tree classifier.
# An empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Predict on the test set
predictions = model.predict(testData.map(lambda x: x.features))
# Zip the true labels with the predicted values
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Compute the fraction of misclassified samples
testErr = labelsAndPredictions.filter(
    lambda (v, p): v != p).count() / float(testData.count())
print('Decision Tree Test Error = %5.3f%%' % (testErr * 100))
print("Decision Tree Learned classification tree model : ")
print(model.toDebugString())

# Save and load the trained model
modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myDecisionTreeClassificationModel"
model.save(sc, modelPath)
sameModel = DecisionTreeModel.load(sc, modelPath)
# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
# $example off$
from pyspark import SparkConf, SparkContext
import urllib.request
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
import createLabeledPoint
from createLabeledPoint import *

try:
    sc.stop()
except:
    pass
sc = SparkContext.getOrCreate(SparkConf())

testm = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_model")
testm_portsweep = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_portsweep_model")

test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
typename = test_raw_data.filter(lambda x: 'portsweep' in x)

cur = 0
idx = 0
count = typename.count()
for idx in range(count):
    typename_pd = typename.zipWithIndex().filter(lambda x: x[1] == idx).map(lambda x: x[0])
    test_csv_data = typename_pd.map(lambda x: x.split(","))
    test_data = test_csv_data.map(create_labeled_point)
    predictions = testm_portsweep.predict(test_data.map(lambda p: p.features))
    if str(predictions.take(1)[0]) == "3.0":
        print(typename_pd.collect())
        cur = cur + 1
parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print('Time consumed = '), (datetime.now() - startTime)
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# save and load model
model.save(sc, "DT-Class-N-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-00-08")
sc.stop()
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Decision Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse the data file into an RDD of LabeledPoint
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# train a decision tree model
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate the model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('test err: ' + str(testErr))
# note: toDebugString is a method and must be called
print('learned classification tree model:\n' + model.toDebugString())

# save and load model
model.save(sc, '../model/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeClassificationModel')

sc.stop()
    label = clean_line_split[10]
    nonLable = clean_line_split[0:10] + clean_line_split[11:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
def selectNext(self):
    # get predictions from individual trees
    self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
        .leftOuterJoin(self.dataset.trainSet) \
        .map(lambda _: (_[0], _[1][1]))

    # zip actual indices with dummy indices so that they can be traced later
    actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))

    # an empty RDD
    rdd = sc.parallelize([])

    ''' these java objects are not serializable, thus there is still no
    support for making an RDD out of them! '''
    for x in self.model._java_model.trees():
        # zip each decision tree's prediction with the individual sample index
        # so that they can be added up later
        predX = DecisionTreeModel(x) \
            .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
        rdd = rdd.union(predX)

    ''' adding up the number of 1s in each sample's predictions;
    this is the vote count for class 1 '''
    sumScore = rdd.groupByKey().mapValues(sum)
    totalEstimators = self.nEstimators

    # average of the predicted scores
    f_1 = sumScore.map(lambda _: (_[0], _[1] / totalEstimators))
    # standard deviation of the predicted scores
    f_2 = sumScore.map(lambda _: getSD(_, totalEstimators))
    # proportion of positive points
    nLabeled = self.trainDataKnown.count()
    nUnlabeled = self.trainDataUnknown.count()
    proportionPositivePoints = (self.trainDataKnown.map(
        lambda _: _[1].label).reduce(lambda x, y: x + y)) / nLabeled
    f_3 = f_1.map(lambda _: proportionPositivePoints)
    # estimate the variance of the forest from the average variance of some predictions
    estimateVariance = (f_2.map(lambda _: _[1]).reduce(lambda x, y: x + y)) / nUnlabeled
    f_6 = f_3.map(lambda _: estimateVariance)
    # number of already labelled datapoints
    f_8 = f_3.map(lambda _: nLabeled)
    myDebugger.TIMESTAMP('features ready for transposing')

    # transposing start
    tempf_1 = f_1.map(lambda _: _[1]).zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_2 = f_2.map(lambda _: _[1]).zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_3 = f_3.zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_6 = f_6.zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_8 = f_8.zipWithIndex().map(lambda _: (_[1], _[0]))

    LALDataset = tempf_1 \
        .leftOuterJoin(tempf_2) \
        .leftOuterJoin(tempf_3) \
        .leftOuterJoin(tempf_6) \
        .leftOuterJoin(tempf_8) \
        .map(lambda _: LabeledPoint(_[0], [_[1][0][0][0][0], _[1][0][0][0][1],
                                           _[1][0][0][1], _[1][0][1], _[1][1]]))
    myDebugger.TIMESTAMP('transposing done')

    # predict the expected reduction in the error from adding each point
    LALprediction = self.lalModel.predict(LALDataset.map(lambda _: _.features)) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    myDebugger.TIMESTAMP('prediction done')

    # select the index with the highest predicted error reduction
    selectedIndex1toN = LALprediction.sortBy(lambda _: _[1]).max()[0]

    # take the selected index from the unknown samples and add it to the known ones
    self.indicesKnown = self.indicesKnown.union(sc.parallelize([selectedIndex1toN]))

    # update the unknown indices
    self.indicesUnknown = self.indicesUnknown.filter(lambda _: _ != selectedIndex1toN)

    ''' debugging block '''
    myDebugger.TIMESTAMP('update unknown indices')
    myDebugger.DEBUG(selectedIndex1toN)
    myDebugger.DEBUG(self.indicesKnown.collect())
    myDebugger.DEBUG(self.indicesUnknown.collect())
    myDebugger.TIMESTAMP('DEBUGGING DONE')
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline
from pyspark.mllib.tree import DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('s3://himaniproject2/ValidationDataset.csv',
                         header='true', inferSchema='true', sep=';')
transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = DecisionTreeModel.load(sc, "s3://himaniproject2/model")

predictions = model.predict(transformed_df.map(lambda x: x.features))
labels_and_predictions = transformed_df.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    transformed_df.count())

print(".........................................................")
print("Model accuracy....................: %.3f%%" % (acc * 100))

metrics = MulticlassMetrics(labels_and_predictions)
fscore = metrics.fMeasure()
print(".........................................................")
print("F1 Score.................................. = %s" % fscore)
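MulticlassMetrics exposes more than the overall F1 score. A short hedged extension of the snippet above, reusing its `labels_and_predictions` RDD; note that MulticlassMetrics formally expects (prediction, label) pairs, so the pairs are swapped first, and the class value 1.0 is only an example label:

# Optional extension, reusing 'labels_and_predictions' from the snippet above.
# MulticlassMetrics expects (prediction, label) pairs, so swap the tuple order.
predictions_and_labels = labels_and_predictions.map(lambda x: (x[1], x[0]))
m = MulticlassMetrics(predictions_and_labels)
# 1.0 is an example label value; substitute one present in the data.
print("Precision (class 1.0): %s" % m.precision(1.0))
print("Recall    (class 1.0): %s" % m.recall(1.0))
print("Confusion matrix:\n%s" % m.confusionMatrix().toArray())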
        .setMaster(master)
        .setAppName(app_name))
sc = SparkContext(conf=conf)

lines = sc.textFile(input)
parsedData = lines.map(parseLine)
(trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
predictions.foreach(my_print)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.foreach(my_print)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, output)
sameModel = DecisionTreeModel.load(sc, output)
sc.stop()
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on the web is wrong; this is correct.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(
    sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
    # Cancelled becomes the 9th column now, and total columns in the data = 9
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-W-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-95-08")
sc.stop()
print "######################################################\n" print "######################################################\n" print "######### Start!!! #######\n" print "######################################################\n" print "######################################################\n" print "\n\n\n" #stop_rdd = rdd_tweets.coalesce(1) #stop_rdd.saveAsTextFile(output_path) print "****************************************************\n" print "Here is the last step\n" print "****************************************************\n" #Here is the trainning steps. binladen_model = DecisionTreeModel.load(sc, binladen_model_path) # #training_data = MLUtils.loadLibSVMFile(sc, training_path) test_data = rdd_labelFeatures # Evaluate model on test instances and compute test error predictions = binladen_model.predict(test_data.map(lambda x: x.features)) # test the error value labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v!=p).count() / float(test_data.count()) tmp_str = 'Test Error = ' + str(testErr) print(tmp_str) log_write(tmp_str) print "\n\n" #featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\ # .zip(predictions)
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
sc.stop()
                  scheduled_departure_time=t[1].scheduled_departure_time,
                  actual_departure_time=t[1].actual_departure_time,
                  departure_delay_minutes=t[1].departure_delay_minutes,
                  scheduled_arrival_time=t[1].scheduled_arrival_time,
                  actual_arrival_time=t[1].actual_arrival_time,
                  arrival_delay_minutes=t[1].arrival_delay_minutes,
                  crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
                  distance=t[1].distance)

if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
sc.stop()
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=n_estimators,
                                     featureSubsetStrategy="auto",
                                     impurity='gini')

''' accuracy test on the test set here '''
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda _: _[0] != _[1])

n_unlabeled = unlabeled_data.count()
rdd = sc.parallelize([])
for tree in model._java_model.trees():
    predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _: _[0].features)) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    rdd = rdd.union(predX)

classPrediction = rdd.groupByKey().mapValues(sum)
classPrediction = classPrediction.sortByKey()
entropies = classPrediction.map(lambda _: abs(0.5 - (1 - (_[1] / n_estimators))))

unlabeled_entropies = unlabeled_indices.map(lambda _: _[0]) \
    .zipWithIndex() \
    .map(lambda _: (_[1], _[0])) \
    .leftOuterJoin(entropies.zipWithIndex().map(lambda _: (_[1], _[0]))) \
    .map(lambda _: _[1])

sorted_unlabeled_entropies = unlabeled_entropies.sortBy(lambda _: _[1])
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("TaxiWeb")
sc = SparkContext(conf=conf)
model = DecisionTreeModel.load(sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
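Once loaded, the model can score individual feature vectors directly; a minimal sketch, assuming a dense vector of the dimensionality this model was trained on:

# Hedged usage sketch: the feature values below are placeholders, not real
# inputs for the TaxiWeb model.
from pyspark.mllib.linalg import Vectors

sample = Vectors.dense([1.0, 2.0, 3.0])
print("Prediction: %s" % model.predict(sample))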