def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def evaluate_model(model_type):
    # 'model_type' avoids shadowing the built-in 'type'
    if model_type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif model_type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif model_type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
    return model
def predict_proba(rf_model, testRDD):
    # Collect the individual decision trees from the underlying Java model
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Start every row with a zeroed vote count for each of the 10 classes
    scores_dict = {i: 0 for i in range(0, 10)}
    scoresRDD = testRDD.map(lambda x: scores_dict.copy())

    def reduceTuple(x):
        # x is (vote_counts_dict, predicted_class); fold the vote into the counts
        x[0][int(x[1])] += 1
        return x[0]

    for tree in trees:
        dtm = DecisionTreeModel(tree)
        currentScoreRDD = dtm.predict(testRDD)
        # Pair each row's running vote counts with this tree's prediction
        scoresRDD = scoresRDD.zip(currentScoreRDD)
        scoresRDD = scoresRDD.map(reduceTuple)

    return scoresRDD
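A hedged usage sketch for the vote-counting wrapper above; `rf_model` (a trained RandomForestModel) and `testRDD` (an RDD of feature vectors) are assumed to already exist:

# Hedged usage sketch: 'rf_model' and 'testRDD' are assumed to exist already.
votesRDD = predict_proba(rf_model, testRDD)
ntrees = float(rf_model.numTrees())

# Normalize raw vote counts into per-class probabilities
probsRDD = votesRDD.map(lambda votes: {cls: count / ntrees for cls, count in votes.items()})
print(probsRDD.take(3))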
def saveModel(self):
    # save the model to the given path
    self.tree_model.save(self.sc, "trained")
    # re-load the saved model
    self.tree_model = DecisionTreeModel.load(self.sc, "trained")
    # re-evaluate
    self.evaluate()
def main(sc, filename):
    '''
    The driver for the Spark scoring application; it generates predictions
    for a given file of features and target variables.
    '''
    rawDataRdd = sc.textFile(filename)
    print "Data Size: {}".format(rawDataRdd.count())

    labeledPointsRdd = rawDataRdd.map(parse_lines)

    # load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # logistic regression predictions
    labels_and_preds = labeledPointsRdd.map(lambda p: (float(logit_model.predict(p.features)), p.label))
    labels_and_preds_collected = labels_and_preds.collect()
    print "\n"
    print "Predictions: Logistic Regression"
    y_true = []
    y_pred = []
    for row in labels_and_preds_collected:
        y_true.append(row[1])
        y_pred.append(row[0])
        # print "predicted: {0} - actual: {1}\n".format(row[0], row[1])

    accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Logistic): {}".format(round(accuracy, 4))
    print_box()
    print "\n"

    # decision tree predictions
    predictions = dt_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_dt = labeledPointsRdd.map(lambda p: p.label).zip(predictions)
    # collect the decision-tree pairs (not the logistic ones, as before)
    labels_and_preds_dt_collected = labels_and_preds_dt.collect()
    accuracy_dt = labels_and_preds_dt.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Decision Tree): {}".format(round(accuracy_dt, 4))
    print_box()
    print "\n"

    # random forest predictions
    predictions_rf = rf_model.predict(labeledPointsRdd.map(lambda p: p.features))
    labels_and_preds_rf = labeledPointsRdd.map(lambda p: p.label).zip(predictions_rf)
    accuracy_rf = labels_and_preds_rf.filter(lambda (v, p): v == p).count() / float(labeledPointsRdd.count())
    print_box()
    print "Prediction Accuracy (Random Forest): {}".format(round(accuracy_rf, 4))
    print_box()
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        model_name = "dt"
        updated_model = "dt0"
        model_path, data_path, metadata_path = '', '', ''

        # loop to check the availability of a newer model classifier
        for i in range(25, -1, -1):
            model_path = "hdfs://VM10-1-0-14:9000/classifier/" + model_name + str(i)
            updated_model = model_name + str(i)
            data_path = model_path + "/data/part-r*"
            metadata_path = model_path + "/metadata/part-00000"
            if patherror(data_path) == False and patherror(metadata_path) == False:
                break

        # load the model classifier
        model = DecisionTreeModel.load(sc, model_path)

        start = time.time()
        reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)

        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words",
                               outputCol="token", toLowercase=True)
        token_filtered = token.transform(words_df)

        # stopword elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)

        prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        prediction = model.predict(tfidf)
        labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
        metrics = MulticlassMetrics(labeled_prediction)

        output = reviews.zip(prediction)
        filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]', '', str(datetime.now())) + ".out"
        output.saveAsTextFile(filename)

        end = time.time()
        print(updated_model, ';', reviews.count(), ';', metrics.accuracy, ';',
              metrics.precision(0.0), ';', metrics.precision(1.0), ';',
              metrics.recall(0.0), ';', metrics.recall(1.0), ';',
              metrics.fMeasure(0.0), ';', metrics.fMeasure(1.0), ';', (end - start))
def get_probs_classify(model, data):
    # Collect the individual decision trees as JavaArray objects
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each remaining tree, apply its prediction to the entire dataset
    # and zip together the results
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
def get_dt_model(sc, train=None):
    model_path = 'dt.model'
    if train is None:
        model = DecisionTreeModel.load(sc, model_path)
    else:
        model = DecisionTree.trainClassifier(train, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=10)
        model.save(sc, model_path)
    return model
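A brief usage sketch for the helper above, assuming `trainingData` is an RDD of LabeledPoints:

# Hedged usage sketch: 'trainingData' is an assumed RDD of LabeledPoint rows.
# First run: train and persist the model under 'dt.model'.
dt_model = get_dt_model(sc, train=trainingData)

# Later runs: reload the persisted model instead of retraining.
dt_model = get_dt_model(sc)
print(dt_model.toDebugString())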
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results with 'union', keyed by feature vector.
    featsAndPredictions = sc.parallelize([])  # empty RDD
    for i in range(ntrees):
        dtm = DecisionTreeModel(trees[i])
        predictions = dtm.predict(data.map(lambda x: x.features))
        featsAndPredictions = featsAndPredictions.union(
            data.map(lambda lp: lp.features).zip(predictions))
        # scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        # scores = scores.map(lambda x: x[0] + x[1])

    # add up the predictions and divide the accumulated scores by the number of trees
    return featsAndPredictions.reduceByKey(lambda a, b: a + b).map(lambda (key, val): (key, val / ntrees))
def test(sc):
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)

    print dtmodel.toDebugString()
    for f in files:
        vec = audio.showFeatures(f)
        testfeatures = Vectors.dense([float(x) for x in vec.split(' ')])
        print(vec)
        pred = dtmodel.predict(testfeatures)
        print("DT Prediction is " + str(pred), classes[int(pred)])
        pred = rfmodel.predict(testfeatures)
        print("RF Prediction is " + str(pred), classes[int(pred)])
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.sparkContext.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.sparkContext.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
def predict_proba(model, data):
    '''
    Input: a PySpark RandomForestModel object and an RDD of LabeledPoints.
    Output: list of probabilities.

    This wrapper exposes the probabilities (i.e. confidences) behind
    a given prediction.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in xrange(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities
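A hedged usage sketch for this wrapper, assuming a trained binary RandomForestModel `rf_model` and an RDD `testData` of LabeledPoints; the collected scores can feed BinaryClassificationMetrics for an AUC estimate:

# Hedged usage sketch: 'rf_model' and 'testData' are assumed to exist already.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

probs = predict_proba(rf_model, testData)
labels = testData.map(lambda lp: lp.label).collect()

# Pair each score with its true label and measure ranking quality (AUC)
score_and_labels = sc.parallelize(list(zip(probs, labels)))
metrics = BinaryClassificationMetrics(score_and_labels)
print(metrics.areaUnderROC)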
def init_spark_context():
    global predictionModel
    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])

    # absolute path in hdfs
    # to run locally, remove the first slash '/', i.e. my_model1, not /my_model1
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')

    sc.addFile('conv/6.p')
    sc.addFile('conv/7.p')
    sc.addFile('conv/8.p')
    sc.addFile('conv/10.p')
    sc.addFile('conv/12.p')
    sc.addFile('conv/36.p')

    return sc
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        # data = pass2libsvm(vector)
        data = pass2libsvm(reduced, classes)
        # data = pass2libsvm(vector, classes)  # for the 5-tuple it should be something like data = pass2libsvm(vector)
        # (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.
        model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        model.save(sc, path + 'model-' + file)
        return model, index
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''
    def to_features(row):
        # project the raw row onto the numeric feature vector used in training
        return [float(row.SearchID), float(row.AdID), float(row.Position),
                float(row.ObjectType), float(row.HistCTR)]

    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data.map(to_features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(to_features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
.setAppName("Mlib") .set("spark.executor.memory", "1g")) sc = SparkContext(conf = conf) dv1 =np.array([1.0,0.0,3.0]) dv2= [1.0,0.0,3.0] sv1 = Vectors.sparse(3,[0,2],[1.0,3.0]) sv2 = sps.csc_matrix((np.array([1.0,3.0]),np.array([0,2]),np.array([0,2])),shape=(3,1)) print sv2 data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt') (trainingData, testData) = data.randomSplit([0.7, 0.3]) model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) # Save and load model model.save(sc, "model_data") sameModel = DecisionTreeModel.load(sc, "model_data")
conf = SparkConf()
conf.setAppName("TA")
sc = SparkContext(conf=conf)
tre = StreamingContext(sc, 10)

htf = HashingTF(50000)

NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes'
NB_model = NaiveBayesModel.load(sc, NB_directory)

LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression'
LR_model = LogisticRegressionModel.load(sc, LR_directory)

DT_output_dir = 'hdfs://master:9000/user/hadoop/DT'
DT_model = DecisionTreeModel.load(sc, DT_output_dir)

voted_classifier = VoteClassifier(NB_model, LR_model, DT_model)

def sentiment(test_sample):
    sample_data_test = test_sample.split(" ")
    cli = htf.transform(sample_data_test)
    return voted_classifier.classify(cli)

lines = tre.socketTextStream(socket.gethostbyname(socket.gethostname()), 10000)
lines.pprint()
tweets = lines.flatMap(lambda text: [(text)])
tweets.pprint()
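`VoteClassifier` is defined elsewhere in that project; a minimal majority-vote sketch consistent with how it is called above might look like this (the implementation details are an assumption, not the project's actual code):

# Hypothetical sketch of the VoteClassifier used above; the real project's
# implementation may differ. It wraps any number of trained mllib models
# and classifies a feature vector by majority vote.
class VoteClassifier(object):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        # collect one vote per model and return the most common label
        votes = [int(c.predict(features)) for c in self._classifiers]
        return max(set(votes), key=votes.count)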
    # Cancelled becomes the 9th column now, and total columns in the data = 9
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")
sc.stop()
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# stop any context left over from a previous run
try:
    sc.stop()
except:
    pass

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
    sc.setLogLevel("ERROR")
    model1 = DecisionTreeModel.load(sc, "runs")
    model2 = DecisionTreeModel.load(sc, "wickets")

    batsmen_cluster = {}
    bowler_cluster = {}
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_batsmen.csv') as f:
        for line in f:
            ar = line.split(',')
            a = []
            a.append(int(ar[0]))
            a.append(float(ar[3]))
            a.append(float(ar[4]))
            batsmen_cluster[ar[2]] = a
    with open('/home/anup/Downloads/hopeyoudontforwardthistoanyone/cluster_bowler.csv') as f:
import json
import requests
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from numpy import array

app = Flask(__name__)

conf = SparkConf()
conf.setAppName("Classification")
try:
    sc.stop()
except:
    pass
sc = SparkContext(pyFiles=['/home/ubuntu/project_src/flaskapp/createLabeledPoint.py',
                           '/home/ubuntu/project_src/flaskapp/ClassSet.py',
                           '/home/ubuntu/project_src/flaskapp/FuncSet.py',
                           '/home/ubuntu/project_src/flaskapp/hello.py']).getOrCreate(conf=conf)
# testm = DecisionTreeModel.load(sc, "hdfs://*****:*****")

@app.route('/')
def hello_world():
    return 'From python hello!'

@app.route('/index')
def index():
    return render_template("index.html")

@app.route('/train')
def trainodule():
    pass

@app.route('/getSpkTstCnt')
def runclass():
    # testm = DecisionTreeModel.load(sc, "hdfs://ip-172-31-1-239:9000/home/ubuntu/project_src/tree_model")
    test_data_file = "hdfs://ip-172-31-1-239:9000/user/ubuntu/corrected.gz"
    test_raw_data = sc.textFile(test_data_file)
sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
# $example off$
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
def getModel(self, path):
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
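A minimal usage sketch, assuming a hypothetical wrapper object that carries the `type` and `sc` attributes this method reads; the class and path below are illustrative only:

# Hypothetical stand-in for the class that owns getModel above.
class ModelLoader(object):
    def __init__(self, sc, type):
        self.sc = sc
        self.type = type
    getModel = getModel  # reuse the method defined above

loader = ModelLoader(sc, 'DecisionTree')
dt_model = loader.getModel('hdfs:///models/dt')  # hypothetical path
print(dt_model.numNodes())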
    # Cancelled becomes the 6th column now, and total columns in the data = 6
    label = clean_line_split[5]
    nonLable = clean_line_split[0:5]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")
sc.stop()
import csv
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Remove the header line from the csv file
def clean(x):
    if (x[29] != "Amount"):
        return x

# Turn a row into a labeled point (label in column 30, two selected features)
def normalize(x):
    return LabeledPoint(float(x[30]), [float(x[0]), float(x[29]) / 25691.16])

sameModel = DecisionTreeModel.load(sc, "./decisiontreefraud")

# make a spark conf
conf = (SparkConf().setMaster("local").setAppName("My app").set("spark.executor.memory", "4g"))

# files have to be added while running to see the data in the stream
ssc = StreamingContext(sc, 1)

lines1 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/b")
trainingData = lines1.map(lambda line: LabeledPoint(float(line.split(" ")[1]),
                                                    [(line.split(" ")[0]), (line.split(" ")[2])])).cache()
trainingData.pprint()

lines2 = ssc.textFileStream("file:///mnt/vdatanodea/datasets/creditcards/credit/c")
def getModel(path, file):
    if path_exist(path + 'index-' + file):
        index = sc.textFile(path + 'index-' + file)
        a = index.collect()
        b = lambda x: [int(i) for i in x]
        return DecisionTreeModel.load(sc, path + 'model-' + file), b(a)
    else:
        vector, classes = dataPreparing(sc.textFile(path + file))
        index = CorrelationFeature(vector)  # if Feature Selection is needed
        reduced = MatrixReducer(vector, index)
        data = pass2libsvm(reduced, classes)

        # Train a DecisionTree model.
        # Empty categoricalFeaturesInfo indicates all features are continuous.

        # Load CSV data
        data2 = spark.read.format("csv").schema(schema).load(path + file)

        # Create a vector assembler to produce a feature vector for each record for use in MLlib.
        # The first 45 csv fields are features, the 46th field is the label. Remove IPs from features.
        assembler = VectorAssembler(inputCols=[schema.names[1]] + schema.names[3:-1],
                                    outputCol="features")

        # Assemble feature vector in a new dataframe
        assembledData = assembler.transform(data2)

        # Create label and feature indexers to speed up categorical columns for the decision tree
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
        labelIndexed = labelIndexer.fit(assembledData).transform(assembledData)
        featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                       maxCategories=20)
        featureIndexed = featureIndexer.fit(labelIndexed).transform(labelIndexed)

        # Create a DecisionTree model trainer
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

        # Chain indexers and model training in a Pipeline
        # pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        # Train model
        # model = pipeline.fit(assembledData)
        model = dt.fit(featureIndexed)

        # model = DecisionTree.trainClassifier(data, numberClasses, {})  # , maxDepth=5, maxBins=32
        # model.save(sc, path + 'model-' + file)
        return model, index
def selectNext(self):
    # predict for the rest of the datapoints
    self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
        .leftOuterJoin(self.dataset.trainSet) \
        .map(lambda _: (_[0], _[1][1]))

    actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    myDebugger.TIMESTAMP('zipping indices')

    rdd = sc.parallelize([])

    ''' these java objects are not serializable, thus there is still no
    support for making an RDD out of them! '''
    for x in self.model._java_model.trees():
        # zip each decision tree's prediction with the individual sample
        # index so that they can be added up later
        predX = DecisionTreeModel(x) \
            .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
        rdd = rdd.union(predX)
    myDebugger.TIMESTAMP('get individual tree predictions')

    ''' adding up the number of 1s in each sample's predictions;
    this is the vote count for class 1 '''
    classPrediction = rdd.groupByKey().mapValues(sum)
    myDebugger.TIMESTAMP('reducing')

    # referencing self.nEstimators directly inside the closure gives an error
    totalEstimators = self.nEstimators

    # predicted probability of class 0
    classPrediction = classPrediction.map(
        lambda _: (_[0], abs(0.5 - (1 - (_[1] / totalEstimators)))))
    myDebugger.TIMESTAMP('mapping')

    # select the index with the highest uncertainty (closest to probability 0.5)
    selectedIndex1toN = classPrediction.sortBy(lambda _: _[1]).first()[0]
    myDebugger.TIMESTAMP('sorting')

    # take the selected index from the unknown samples and add it to the known ones
    self.indicesKnown = self.indicesKnown.union(sc.parallelize([selectedIndex1toN]))
    myDebugger.TIMESTAMP('update known indices')

    # remove the selected sample from the unlabeled ones (update)
    self.indicesUnknown = self.indicesUnknown.filter(lambda _: _ != selectedIndex1toN)
    myDebugger.TIMESTAMP('update unknown indices')

    myDebugger.DEBUG(selectedIndex1toN)
    myDebugger.DEBUG(self.indicesKnown.collect())
    myDebugger.DEBUG(self.indicesUnknown.collect())
    myDebugger.TIMESTAMP('DEBUGGING DONE')
parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print('Time consumed = '), (datetime.now() - startTime)
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# save and load model
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
sc.stop()
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()

    lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd, iterations=10)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
    dt_model_dir = os.path.join(temp_dir, "dt")
    dt_model.save(self.sc, dt_model_dir)
    same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
    self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)
    rf_model_dir = os.path.join(temp_dir, "rf")
    rf_model.save(self.sc, rf_model_dir)
    same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
    self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
    gbt_model_dir = os.path.join(temp_dir, "gbt")
    gbt_model.save(self.sc, gbt_model_dir)
    same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
    self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

    try:
        rmtree(temp_dir)
    except OSError:
        pass
data = MLUtils.loadLibSVMFile(sc, dataPath)

# Split the dataset into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])
print("train data count: " + str(trainingData.count()))
print("test data count : " + str(testData.count()))

# Train a decision tree classifier.
# An empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Predict on the test set
predictions = model.predict(testData.map(lambda x: x.features))
# Zip the true labels with the predicted values
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Compute the fraction of misclassified samples
testErr = labelsAndPredictions.filter(
    lambda (v, p): v != p).count() / float(testData.count())
print('Decision Tree Test Error = %5.3f%%' % (testErr * 100))
print("Decision Tree Learned classification tree model : ")
print(model.toDebugString())

# Save and load the trained model
modelPath = "/home/zhb/Desktop/work/DecisionTreeShareProject/app/myDecisionTreeClassificationModel"
model.save(sc, modelPath)
sameModel = DecisionTreeModel.load(sc, modelPath)
# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
# $example off$
from pyspark import SparkConf, SparkContext
import urllib.request
import urllib
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time
import createLabeledPoint
from createLabeledPoint import *

try:
    sc.stop()
except:
    pass
sc = SparkContext.getOrCreate(SparkConf())

testm = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_model")
testm_portsweep = DecisionTreeModel.load(sc, "/home/ubuntu/project_src/probe_portsweep_model")

test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
typename = test_raw_data.filter(lambda x: 'portsweep' in x)

cur = 0
idx = 0
count = typename.count()
for idx in range(count):
    typename_pd = typename.zipWithIndex().filter(lambda x: x[1] == idx).map(lambda x: x[0])
    test_csv_data = typename_pd.map(lambda x: x.split(","))
    test_data = test_csv_data.map(create_labeled_point)
    predictions = testm_portsweep.predict(test_data.map(lambda p: p.features))
    if str(predictions.take(1)[0]) == "3.0":
        print(typename_pd.collect())
        cur = cur + 1
parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print('Time consumed = '), (datetime.now() - startTime)
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# save and load model
model.save(sc, "DT-Class-N-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-00-08")
sc.stop()
# coding=utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Decision Tree Classification').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse the data file into an RDD of LabeledPoint
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# train a decision tree model
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate the model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('test err: ' + str(testErr))
# note: toDebugString is a method and must be called
print('learned classification tree model:\n' + model.toDebugString())

# save and load model
model.save(sc, '../model/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(sc, '../model/myDecisionTreeClassificationModel')

sc.stop()
    label = clean_line_split[10]
    nonLable = clean_line_split[0:10] + clean_line_split[11:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
def selectNext(self):
    # get predictions from individual trees
    self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
        .leftOuterJoin(self.dataset.trainSet) \
        .map(lambda _: (_[0], _[1][1]))

    # zip actual indices with dummy indices so that they can be traced later
    actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))

    # an empty RDD
    rdd = sc.parallelize([])

    ''' these java objects are not serializable, thus there is still no
    support for making an RDD out of them! '''
    for x in self.model._java_model.trees():
        # zip each decision tree's prediction with the individual sample index
        # so that they can be added up later
        predX = DecisionTreeModel(x) \
            .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
        rdd = rdd.union(predX)

    ''' adding up the number of 1s in each sample's predictions;
    this is the vote count for class 1 '''
    sumScore = rdd.groupByKey().mapValues(sum)
    totalEstimators = self.nEstimators

    # average of the predicted scores
    f_1 = sumScore.map(lambda _: (_[0], _[1] / totalEstimators))
    # standard deviation of the predicted scores
    f_2 = sumScore.map(lambda _: getSD(_, totalEstimators))
    # proportion of positive points
    nLabeled = self.trainDataKnown.count()
    nUnlabeled = self.trainDataUnknown.count()
    proportionPositivePoints = (self.trainDataKnown.map(
        lambda _: _[1].label).reduce(lambda x, y: x + y)) / nLabeled
    f_3 = f_1.map(lambda _: proportionPositivePoints)
    # estimate the variance of the forest from the average variance of some predictions
    estimateVariance = (f_2.map(lambda _: _[1]).reduce(lambda x, y: x + y)) / nUnlabeled
    f_6 = f_3.map(lambda _: estimateVariance)
    # number of already labelled datapoints
    f_8 = f_3.map(lambda _: nLabeled)
    myDebugger.TIMESTAMP('features ready for transposing')

    # transposing start
    tempf_1 = f_1.map(lambda _: _[1]).zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_2 = f_2.map(lambda _: _[1]).zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_3 = f_3.zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_6 = f_6.zipWithIndex().map(lambda _: (_[1], _[0]))
    tempf_8 = f_8.zipWithIndex().map(lambda _: (_[1], _[0]))

    LALDataset = tempf_1 \
        .leftOuterJoin(tempf_2) \
        .leftOuterJoin(tempf_3) \
        .leftOuterJoin(tempf_6) \
        .leftOuterJoin(tempf_8) \
        .map(lambda _: LabeledPoint(_[0], [_[1][0][0][0][0], _[1][0][0][0][1],
                                           _[1][0][0][1], _[1][0][1], _[1][1]]))
    myDebugger.TIMESTAMP('transposing done')

    # predict the expected reduction in the error from adding each point
    LALprediction = self.lalModel.predict(LALDataset.map(lambda _: _.features)) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    myDebugger.TIMESTAMP('prediction done')

    # select the index with the highest predicted error reduction
    selectedIndex1toN = LALprediction.sortBy(lambda _: _[1]).max()[0]

    # take the selected index from the unknown samples and add it to the known ones
    self.indicesKnown = self.indicesKnown.union(sc.parallelize([selectedIndex1toN]))

    # update the unknown indices
    self.indicesUnknown = self.indicesUnknown.filter(lambda _: _ != selectedIndex1toN)

    ''' debugging block '''
    myDebugger.TIMESTAMP('update unknown indices')
    myDebugger.DEBUG(selectedIndex1toN)
    myDebugger.DEBUG(self.indicesKnown.collect())
    myDebugger.DEBUG(self.indicesUnknown.collect())
    myDebugger.TIMESTAMP('DEBUGGING DONE')
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml import Pipeline
from pyspark.mllib.tree import DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors

sc = SparkContext()
spark = SparkSession(sc)

inputDF = spark.read.csv('s3://himaniproject2/ValidationDataset.csv',
                         header='true', inferSchema='true', sep=';')
transformed_df = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

model = DecisionTreeModel.load(sc, "s3://himaniproject2/model")

predictions = model.predict(transformed_df.map(lambda x: x.features))
labels_and_predictions = transformed_df.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    transformed_df.count())

print(".........................................................")
print("Model accuracy....................: %.3f%%" % (acc * 100))

metrics = MulticlassMetrics(labels_and_predictions)
fscore = metrics.fMeasure()
print(".........................................................")
print("F1 Score.................................. = %s" % fscore)
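MulticlassMetrics exposes more than the overall F1 score. A short hedged extension of the snippet above, reusing its `labels_and_predictions` RDD; note that MulticlassMetrics formally expects (prediction, label) pairs, so the pairs are swapped first, and the class value 1.0 is only an example label:

# Optional extension, reusing 'labels_and_predictions' from the snippet above.
# MulticlassMetrics expects (prediction, label) pairs, so swap the tuple order.
predictions_and_labels = labels_and_predictions.map(lambda x: (x[1], x[0]))
m = MulticlassMetrics(predictions_and_labels)
# 1.0 is an example label value; substitute one present in the data.
print("Precision (class 1.0): %s" % m.precision(1.0))
print("Recall    (class 1.0): %s" % m.recall(1.0))
print("Confusion matrix:\n%s" % m.confusionMatrix().toArray())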
        .setMaster(master)
        .setAppName(app_name))
sc = SparkContext(conf=conf)

lines = sc.textFile(input)
parsedData = lines.map(parseLine)
(trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
predictions.foreach(my_print)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.foreach(my_print)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, output)
sameModel = DecisionTreeModel.load(sc, output)
sc.stop()
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(
    sc, 'file:///usr/local/spark/data/mllib/sample_libsvm_data.txt'
)  # The code on the web is wrong; this is correct.
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(
    testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
sameModel = DecisionTreeModel.load(
    sc, 'file:///home/hadoop/tmp/myDecisionTreeClassificationModel')
    # Cancelled becomes the 9th column now, and total columns in the data = 9
    label = clean_line_split[8]
    nonLable = clean_line_split[0:8]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = DecisionTree.trainClassifier(training, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Error = ' + str(testErr))
print ('Learned classification tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DT-Class-W-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-95-08")
sc.stop()
print "######################################################\n" print "######################################################\n" print "######### Start!!! #######\n" print "######################################################\n" print "######################################################\n" print "\n\n\n" #stop_rdd = rdd_tweets.coalesce(1) #stop_rdd.saveAsTextFile(output_path) print "****************************************************\n" print "Here is the last step\n" print "****************************************************\n" #Here is the trainning steps. binladen_model = DecisionTreeModel.load(sc, binladen_model_path) # #training_data = MLUtils.loadLibSVMFile(sc, training_path) test_data = rdd_labelFeatures # Evaluate model on test instances and compute test error predictions = binladen_model.predict(test_data.map(lambda x: x.features)) # test the error value labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v!=p).count() / float(test_data.count()) tmp_str = 'Test Error = ' + str(testErr) print(tmp_str) log_write(tmp_str) print "\n\n" #featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\ # .zip(predictions)
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
sc.stop()
                  scheduled_departure_time=t[1].scheduled_departure_time,
                  actual_departure_time=t[1].actual_departure_time,
                  departure_delay_minutes=t[1].departure_delay_minutes,
                  scheduled_arrival_time=t[1].scheduled_arrival_time,
                  actual_arrival_time=t[1].actual_arrival_time,
                  arrival_delay_minutes=t[1].arrival_delay_minutes,
                  crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
                  distance=t[1].distance)

if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()

# build the model
# empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Narrow-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Narrow-2008")
sc.stop()
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=n_estimators,
                                     featureSubsetStrategy="auto",
                                     impurity='gini')

''' accuracy test on the test set here '''
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda _: _[0] != _[1])

n_unlabeled = unlabeled_data.count()
rdd = sc.parallelize([])
for tree in model._java_model.trees():
    predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _: _[0].features)) \
        .zipWithIndex() \
        .map(lambda _: (_[1], _[0]))
    rdd = rdd.union(predX)

classPrediction = rdd.groupByKey().mapValues(sum)
classPrediction = classPrediction.sortByKey()
entropies = classPrediction.map(lambda _: abs(0.5 - (1 - (_[1] / n_estimators))))

unlabeled_entropies = unlabeled_indices.map(lambda _: _[0]) \
    .zipWithIndex() \
    .map(lambda _: (_[1], _[0])) \
    .leftOuterJoin(entropies.zipWithIndex().map(lambda _: (_[1], _[0]))) \
    .map(lambda _: _[1])

sorted_unlabeled_entropies = unlabeled_entropies.sortBy(lambda _: _[1])
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("TaxiWeb")
sc = SparkContext(conf=conf)
model = DecisionTreeModel.load(sc, "TugasAkhir/Model/decision_tree/decision_tree_v5")
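Once loaded, the model can score individual feature vectors directly; a minimal sketch, assuming a dense vector of the dimensionality this model was trained on:

# Hedged usage sketch: the feature values below are placeholders, not real
# inputs for the TaxiWeb model.
from pyspark.mllib.linalg import Vectors

sample = Vectors.dense([1.0, 2.0, 3.0])
print("Prediction: %s" % model.predict(sample))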