def trainClassifier(self):
    """Fit the decision-tree classifier, report timing, then evaluate it.

    Trains on ``self.training_labeled``, stores the fitted model in
    ``self.tree_model``, prints the model and the elapsed wall-clock time,
    and finally calls ``self.evaluate()``.
    """
    started_at = time()

    # Features 0-3 are declared categorical; the category count of each one
    # is the length of the matching attribute.
    category_sources = (self.tags, self.numeric, self.x, self.y)
    categories_per_feature = dict(
        (feature_index, len(values))
        for feature_index, values in enumerate(category_sources))

    self.tree_model = DecisionTree.trainClassifier(
        self.training_labeled,
        numClasses=4,
        categoricalFeaturesInfo=categories_per_feature,
        impurity="gini",
        maxDepth=5,
        maxBins=1000)
    print(self.tree_model)

    elapsed = time() - started_at
    print("Classifier trained in {} seconds.".format(round(elapsed, 3)))

    # Evaluation is kicked off once training has been reported.
    self.evaluate()
def main():
    """Train a decision tree on ``train.csv`` and run it against ``test.csv``.

    A SparkContext is created for the duration of the call and is always
    stopped on exit, even when loading, training or testing raises.
    """
    sc = SparkContext(appName="MyApp")
    try:
        sc.setLogLevel('ERROR')

        # Parse data. The test file is loaded with use_labels=False, so the
        # labels returned for it are only passed through to reformatData.
        train_labels, train_data = load_data('train.csv')
        dummy_labels, test_data = load_data('test.csv', use_labels=False)

        # Map each data point's label to its features.
        train_set = reformatData(train_data, train_labels)
        test_set = reformatData(test_data, dummy_labels)

        # Parallelize the data.
        parallelized_train_set = sc.parallelize(train_set)
        parallelized_test_set = sc.parallelize(test_set)

        # Split weights of [1.0, 0.0]: the validation side is requested with
        # weight zero and is not used below.
        trainSet, validationSet = parallelized_train_set.randomSplit(
            [1.0, 0.0], seed=42)

        # Train the model.
        decisionTreeModel = DecisionTree.trainClassifier(
            trainSet,
            numClasses=5,
            categoricalFeaturesInfo={},
            impurity='gini',
            maxBins=55,
            maxDepth=30,
            minInstancesPerNode=2)

        # Test the model.
        testDecisionTree(decisionTreeModel, parallelized_test_set)
    finally:
        # Release the context so a later run can create a fresh one.
        sc.stop()
def decisionTree(trainingRDD, trainingRDDHashed, testRDDHashed, testRDD):
    """Train a depth-2 binary decision tree and score it on both data sets.

    Args:
        trainingRDD: RDD of labeled points used for training and for the
            training-set score.
        trainingRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy``
            as the training-set size.
        testRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy`` as
            the test-set size.
        testRDD: RDD of labeled points used for the test-set score.

    Returns:
        Tuple ``(AccuracyV, fScoreV, AccuracyT, fScoreT)``.
    """
    # Each count() is a Spark job, so run it once per RDD.
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Train the Decision Tree Model
    trainedModel = DecisionTree.trainClassifier(
        trainingRDD,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=2,
        maxBins=3)

    def label_prediction_counts(rdd):
        # Count every (label, prediction) pair; unseen pairs read as 0.
        predictions = trainedModel.predict(rdd.map(lambda x: x.features))
        counts = rdd.map(lambda lp: lp.label).zip(predictions).countByValue()
        return defaultdict(lambda: 0, counts)

    # Test the Model on the Training Set
    resultsValidation = label_prediction_counts(trainingRDD)
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)

    # Test the Model on the Test Set
    resultsTest = label_prediction_counts(testRDD)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)

    # Print Results. The % formatting has to happen inside the call:
    # print(...) returns None, so formatting its result fails on Python 3.
    print(' Results for Decision Tree')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
def generateDecisionTree():
    """Train, evaluate and save a decision tree unless one already exists.

    Does nothing but print a message when ``DT_PATH`` exists. Otherwise the
    data at ``F_PATH`` is parsed, split 90/10, a classifier is trained and
    stored in the module-level ``model``, the test error and tree are
    printed, ``modelStatistics`` is called, and the model is saved to
    ``DT_PATH``.
    """
    global model

    if os.path.exists(DT_PATH):
        print("DT_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)
    # A plain int seed is valid on Python 2 and 3 (the 1L literal is not).
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)
    model = DecisionTree.trainClassifier(
        trainingData,
        numClasses=len(classes),
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=5,
        maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of unpacking it in the lambda signature, which
    # Python 3 no longer allows.
    mismatches = labelsAndPredictions.filter(
        lambda pair: pair[0] != pair[1]).count()
    testErr = mismatches / float(testData.count())
    # Build one string so Python 2 does not print a tuple.
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    modelStatistics(labelsAndPredictions)

    # Save the model
    model.save(sc, DT_PATH)
    print("Decision Tree model saved!")
def train(self, num_classes=2, categorical_features=None, max_depth=5):
    """Train a decision-tree classifier and wrap it in a DecisionTreeModel.

    Args:
        num_classes: Number of target classes.
        categorical_features: Mapping passed as ``categoricalFeaturesInfo``;
            a falsy value is replaced by an empty dict.
        max_depth: Maximum depth of the tree.

    Returns:
        ``DecisionTreeModel`` built from the fitted model and
        ``self.feature_cols``.
    """
    if not categorical_features:
        categorical_features = {}

    fitted_tree = DecisionTree.trainClassifier(
        self._labeled_feature_vector_rdd(),
        numClasses=num_classes,
        categoricalFeaturesInfo=categorical_features,
        maxDepth=max_depth,
    )
    return DecisionTreeModel(fitted_tree, self.feature_cols)
def trainModel(trainingData):
    """Train a binary gini decision tree, logging the time before and after.

    Args:
        trainingData: Training data handed to ``DecisionTree.trainClassifier``.

    Returns:
        The trained decision-tree model.
    """
    tree_settings = dict(
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=5,
        maxBins=32,
    )

    print('\nTraining Decision Tree model started')
    Utils.logTime()

    fitted_tree = DecisionTree.trainClassifier(trainingData, **tree_settings)

    print('\nTraining Decision Tree model finished')
    Utils.logTime()
    return fitted_tree
def RunDecisionTree(tf):
    """Train a decision tree on an 80% split and print its training error.

    Args:
        tf: RDD whose records ``parseAsLabeledPoints`` turns into labeled
            points.

    The number of classes is read from the module-level ``numCat``.
    """
    rdd = tf.map(parseAsLabeledPoints)
    # Only the training side of the split is used by this function.
    train, _held_out = rdd.randomSplit([.8, .2])

    model = DecisionTree.trainClassifier(
        train,
        numClasses=numCat,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=5,
        maxBins=100)

    predictions = model.predict(train.map(lambda x: x.features))
    labelsAndPredictions = train.map(lambda lp: lp.label).zip(predictions)
    mismatches = labelsAndPredictions.filter(
        lambda pair: pair[0] != pair[1]).count()
    # The predictions are for the training split, so the error rate is taken
    # over the training split size, not the held-out split size.
    trainErr = mismatches / float(train.count())
    print('Training Error = ' + str(trainErr))
def RunDecisionTree(tf):
    """Train a decision tree on an 80% split and report held-out accuracy.

    Args:
        tf: RDD whose records ``parseAsLabeledPoints`` turns into labeled
            points.

    The number of classes is ``len(genCats)`` (module-level ``genCats``).
    """
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    numCat = len(genCats)

    model = DecisionTree.trainClassifier(
        train,
        numClasses=numCat,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=5,
        maxBins=100)

    # Evaluate the model on the held-out test instances and compute the
    # test error.
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    mismatches = labelsAndPredictions.filter(
        lambda pair: pair[0] != pair[1]).count()
    testErr = mismatches / float(test.count())

    # Single-string prints behave the same on Python 2 and 3. The figure is
    # measured on the test split, so it is labelled as the test error.
    print('Accuracy of decision tree = ' + str(1 - testErr))
    print('Test Error = ' + str(testErr))
def test_classification(self):
    """Check that each MLlib classifier separates a tiny two-class dataset.

    Every model is trained on the same four labeled points and must predict
    a value <= 0 for the label-0.0 samples and > 0 for the label-1.0 ones.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    labeled_rows = (
        (0.0, [1, 0, 0]),
        (1.0, [0, 1, 1]),
        (0.0, [2, 0, 0]),
        (1.0, [0, 2, 1]),
    )
    data = [LabeledPoint(label, values) for label, values in labeled_rows]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_separates(model):
        # Even positions hold label 0.0 samples, odd positions label 1.0.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    assert_separates(LogisticRegressionWithSGD.train(rdd))
    assert_separates(SVMWithSGD.train(rdd))
    assert_separates(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    assert_separates(DecisionTree.trainClassifier(
        rdd, numClasses=2,
        categoricalFeaturesInfo=categoricalFeaturesInfo))
    assert_separates(RandomForest.trainClassifier(
        rdd, numClasses=2,
        categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    assert_separates(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins):
    """Train a 4-class decision tree and report its error on both sets.

    Args:
        trainingSet: RDD of labeled points used for training.
        testSet: RDD of labeled points used for the test-set report.
        imp: Impurity measure passed to the trainer.
        dtMaxDepth: Maximum tree depth.
        dtMaxBins: Maximum number of bins.

    Returns:
        The trained decision-tree model.
    """
    decisionTreeModel = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=4,
        categoricalFeaturesInfo={},
        impurity=imp,
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins)

    # Same report for both data sets: training first, then test.
    reports = (
        ("\nClassification model Training set", trainingSet),
        ("\nClassification model Test set", testSet),
    )
    for title, dataset in reports:
        predicted = decisionTreeModel.predict(
            dataset.map(lambda item: item.features))
        labels_with_predictions = dataset.map(
            lambda item: item.label).zip(predicted)
        eva.calculateErrorRate(title, labels_with_predictions)

    return decisionTreeModel
def classify(sc, sample):
    """Train a decision tree on ``sample`` and predict its first record.

    Args:
        sc: SparkContext used to parallelize the sample.
        sample: Iterable of records; element 0 of a record is a sequence of
            word features and element 1 is the answer ("yes" maps to 1,
            anything else to 0).
    """
    # Position in this tuple is the numeric code of every word in the group.
    word_groups = (
        ["rainy", "sad", "lack"],
        ["cloudy", "soso", "enough"],
        ["sunny", "happy", "most"],
    )

    def ff(x):
        # Encode known words as 0/1/2; words in no group are dropped.
        encoded = []
        for word in x:
            for code, group in enumerate(word_groups):
                if word in group:
                    encoded.append(code)
                    break
        return encoded

    def f(x):
        if x == "yes":
            return 1
        return 0

    def to_labeled_point(record):
        feature_codes = ff(record[0])
        label = f(record[1])
        return LabeledPoint(label, feature_codes)

    traindata = sc.parallelize(sample).map(to_labeled_point)
    testdata = traindata.first()
    print(testdata)

    ######
    # print "logistic"
    # lrModel = LogisticRegressionWithSGD.train(traindata, 10)
    # prediction = lrModel.predict(testdata.features)
    # print prediction

    #####
    # print "svm"
    # svmModel = SVMWithSGD.train(traindata, 10)
    # prediction = svmModel.predict(testdata.features)
    # print prediction

    ####
    # print "naive bayes"
    # nbModel = NaiveBayes.train(traindata)
    # prediction = nbModel.predict(testdata.features)
    # print prediction

    ####
    print("decesion tree")
    detreeModel = DecisionTree.trainClassifier(traindata, 2, {})
    prediction = detreeModel.predict(testdata.features)
    print(prediction)
def main(input_file):
    """Train a 4-class decision tree on ``input_file`` and print its test error.

    Args:
        input_file: Path of labeled points readable by
            ``MLUtils.loadLabeledPoints``.

    The SparkContext created here is always stopped on exit.
    """
    sc = pyspark.SparkContext(appName="DecisionTree")
    try:
        data = MLUtils.loadLabeledPoints(sc, input_file)
        trainingData, testData = data.randomSplit([0.70, 0.3])

        # Cache in memory for faster training
        trainingData.cache()

        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=4,
            impurity='gini',
            categoricalFeaturesInfo={},
            maxDepth=16,
            maxBins=10)

        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(
            lambda lp: lp.label).zip(predictions)
        # Index the pair instead of unpacking it in the lambda signature,
        # which Python 3 no longer allows.
        mismatches = labelsAndPredictions.filter(
            lambda pair: pair[0] != pair[1]).count()
        testErr = mismatches / float(testData.count())

        # print tree_model.toDebugString()
        print("")
        print("")
        print("Test Erros: {}".format(round(testErr, 4)))
    finally:
        # Release the context even when loading or training fails.
        sc.stop()
def trainModel(self, vectSpace, path):
    """Train the model selected by ``self.type`` and save it under ``path``.

    Args:
        vectSpace: Training data passed to the selected trainer.
        path: Directory the model is saved to. An existing directory is
            removed and recreated first.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``self.type`` is neither ``'NaiveBayes'`` nor
            ``'DecisionTree'``. Raised before ``path`` is touched.
        Exception: Any training, filesystem or save error is printed and
            re-raised unchanged.
    """
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(
                vectSpace,
                numClasses=len(self.category),
                categoricalFeaturesInfo={},
                impurity='gini',
                maxDepth=5,
                maxBins=5)
        else:
            # Fail before the existing output directory is deleted; without
            # this check the save below would hit an unbound ``model``.
            raise ValueError(
                "Unsupported model type: {!r}".format(self.type))

        # Start from an empty directory so stale model files cannot survive.
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        model.save(self.sc, path)
    except:
        # Report and propagate; nothing is swallowed here.
        print("Unexpected error: {}".format(sys.exc_info()[0]))
        raise
    return model
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile, markedClusterFile):
    """Train an entropy decision tree that predicts cluster ids from eigenvectors.

    Args:
        sc: SparkContext.
        dtClusterNum: Number of classes (clusters).
        dtMaxDepth: Maximum tree depth.
        dtMaxBins: Maximum number of bins.
        eigenVecFile: Text file whose lines go through ``removeVirtualPart``.
        markedClusterFile: Text file whose lines go through
            ``extractClusterID``.

    Returns:
        The trained decision-tree model.
    """
    # Both files are collected to the driver and parallelized again before
    # being zipped into (cluster id, eigenvector) pairs.
    filteredEigenVec = sc.textFile(eigenVecFile).map(removeVirtualPart).collect()
    clusterIDs = sc.textFile(markedClusterFile).map(extractClusterID).collect()
    id_vector_pairs = sc.parallelize(clusterIDs).zip(
        sc.parallelize(filteredEigenVec))
    labeled_points = id_vector_pairs.map(
        lambda pair: LabeledPoint(pair[0], pair[1]))

    trainingSet, testSet = labeled_points.randomSplit([0.7, 0.3])

    decisionTreeModel = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=dtClusterNum,
        categoricalFeaturesInfo={},
        impurity='entropy',
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins)

    # Same report for both data sets: training first, then test.
    reports = (
        ("\nCluster model Training set", trainingSet),
        ("\nCluster model Test set", testSet),
    )
    for title, dataset in reports:
        predicted = decisionTreeModel.predict(
            dataset.map(lambda item: item.features))
        labels_with_predictions = dataset.map(
            lambda item: item.label).zip(predicted)
        eva.calculateErrorRate(title, labels_with_predictions)

    return decisionTreeModel
def create_model(name, training):
    """Train the model identified by ``name`` on ``training``.

    Args:
        name: ``'logistic'``, ``'tree'`` or ``'rf'``.
        training: Training data passed to the selected trainer.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``name`` is not one of the supported identifiers.
    """
    def announce(title):
        # Banner printed before every training run.
        print_box()
        print(title)
        print_box()

    if name == 'logistic':
        announce("Logistic Regression Model")
        return LogisticRegressionWithLBFGS.train(training)

    if name == 'tree':
        announce("Decision Tree Model")
        return DecisionTree.trainClassifier(
            training,
            numClasses=2,
            categoricalFeaturesInfo={},
            impurity='gini',
            maxDepth=5,
            maxBins=32)

    if name == 'rf':
        announce("Random Forest Model")
        return RandomForest.trainClassifier(
            training,
            numClasses=2,
            categoricalFeaturesInfo={},
            numTrees=15,
            featureSubsetStrategy="auto",
            impurity='gini',
            maxDepth=5,
            maxBins=50)

    # Say which name was rejected instead of failing on an unbound local.
    raise ValueError(
        "Unknown model name {!r}; expected 'logistic', 'tree' or 'rf'".format(name))
def trainOptimalModel(trainingData, testData):
    """Grid-search decision-tree parameters and return the best model.

    Every combination of impurity, maximum depth and maximum bins is trained
    on ``trainingData`` and scored with ``Evaluation.evaluate`` on
    ``testData``; the model with the lowest test error is kept.

    The search stops at the first combination that raises. The failing
    parameters are logged and the best model found so far is returned,
    which is ``None`` if nothing succeeded.

    Args:
        trainingData: Data used to train every candidate.
        testData: Data used to score every candidate.

    Returns:
        The model with the lowest test error, or ``None``.
    """
    print("\nTraining optimal Decision Tree model started!")
    Utils.logTime()

    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    # Pre-bound so the failure message below can always be built.
    curImpurity = curMaxDepth = curMaxBins = None

    try:
        for curImpurity in impurityVals:
            for curMaxDepth in maxDepthVals:
                for curMaxBins in maxBinsVals:
                    model = DecisionTree.trainClassifier(
                        trainingData,
                        numClasses=2,
                        categoricalFeaturesInfo={},
                        impurity=curImpurity,
                        maxDepth=curMaxDepth,
                        maxBins=curMaxBins)
                    testErr, PR, ROC = Evaluation.evaluate(model, testData)
                    # Test for None explicitly: a best error of 0.0 is falsy
                    # and must not be treated as "no result yet".
                    if minError is None or testErr < minError:
                        minError = testErr
                        optimalImpurity = curImpurity
                        optimalMaxDepth = curMaxDepth
                        optimalBinsVal = curMaxBins
                        optimalModel = model
    except Exception:
        # Best effort: log what failed and fall through with the best model
        # so far. KeyboardInterrupt and SystemExit still propagate.
        msg = "\nException during model training with below parameters:"
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
def test_classification(self):
    """Check that each MLlib classifier handles SciPy sparse feature vectors.

    Every model is trained on the same four labeled points and must predict
    a value <= 0 for the label-0.0 samples and > 0 for the label-1.0 ones.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree

    # (label, non-zero entries) of each two-element sparse feature vector.
    sparse_rows = (
        (0.0, {0: 1.0}),
        (1.0, {1: 1.0}),
        (0.0, {0: 2.0}),
        (1.0, {1: 2.0}),
    )
    data = [LabeledPoint(label, self.scipy_matrix(2, entries))
            for label, entries in sparse_rows]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def assert_separates(model):
        # Even positions hold label 0.0 samples, odd positions label 1.0.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    assert_separates(LogisticRegressionWithSGD.train(rdd))
    assert_separates(SVMWithSGD.train(rdd))
    assert_separates(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    assert_separates(DecisionTree.trainClassifier(
        rdd, numClasses=2,
        categoricalFeaturesInfo=categoricalFeaturesInfo))
# # print('\n== ACCURACY BAYES : ', accuracy_bayes , '==')
# # file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n")
# file.write('\n-> ACCURACY BAYES : ' + str(accuracy_bayes) + '\n')

print("\n===================================================== ")
print("=================== DECISION TREE =================== ")
print("===================== (Entropy) ===================== ")
print("=====================================================\n")

print("\n=================== Training ================== \n")

model_decision_tree_entropy = DecisionTree.trainClassifier(
    training,
    categoricalFeaturesInfo={},
    impurity="entropy",
    maxDepth=5,
    numClasses=2)
print("Done : DT entropy training")

print("\n=================== Testing =================== \n")

# Decision tree (entropy): predict on the test data.
predictions_decision_tree_enptropy = model_decision_tree_entropy.predict(
    test)
# countByValue() is a Spark action, so run it once and read both classes
# from the result. It returns a defaultdict, so a missing class reads as 0.
prediction_counts_entropy = predictions_decision_tree_enptropy.countByValue()
num_pos_entropy = prediction_counts_entropy[1.0]
num_neg_entropy = prediction_counts_entropy[0.0]

print("\n== PREDICTION ENTROPY : ==\n")
print("- Positive : ", num_pos_entropy)
sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

# $example on$
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'carbon2.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Features 0 and 1 are declared categorical with 5 categories each; every
# other feature is treated as continuous.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=5,
    categoricalFeaturesInfo={0: 5, 1: 5},
    impurity='entropy',
    maxDepth=5,
    maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair instead of unpacking it in the lambda
# signature, which Python 3 no longer allows.
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
# Load data: a single command-line argument overrides the bundled sample.
dataPath = (sys.argv[1] if len(sys.argv) == 2
            else 'data/mllib/sample_libsvm_data.txt')
if not os.path.isfile(dataPath):
    sc.stop()
    usage()
points = MLUtils.loadLibSVMFile(sc, dataPath)

# Re-index class labels if needed.
(reindexedData, origToNewLabels) = reindexClassLabels(points)
numClasses = len(origToNewLabels)

# Train a classifier.
categoricalFeaturesInfo = {}  # no categorical features
model = DecisionTree.trainClassifier(
    reindexedData,
    numClasses=numClasses,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
)

# Print learned tree and stats.
print("Trained DecisionTree for classification:")
print(" Model numNodes: %d" % model.numNodes())
print(" Model depth: %d" % model.depth())
print(" Training accuracy: %g" % getAccuracy(model, reindexedData))
# The full debug dump is only printed for trees with fewer than 20 nodes.
print(model.toDebugString() if model.numNodes() < 20 else model)

sc.stop()
pd.DataFrame(dfa.take(5), columns=dfa.columns).transpose()


def labelData(data):
    """Map each row to a LabeledPoint: column 9 is the label, the rest features.

    The features are columns 0-8 followed by columns 10-15, in that order.
    """
    def to_labeled_point(row):
        feature_values = [row[index] for index in range(16) if index != 9]
        return LabeledPoint(row[9], feature_values)

    return data.map(to_labeled_point)


trainData, testData = labelData(dfa).randomSplit([0.8, 0.2])
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=3,
    maxDepth=10,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxBins=32,
)
print(model.toDebugString())


def getPredictionLabels(model, testData):
    """Return an RDD of (prediction, label) pairs for ``testData``."""
    predicted = model.predict(testData.map(lambda r: r.features))
    actual = testData.map(lambda r: r.label)
    return predicted.zip(actual)


def printMetrics(pred_and_label):
    """Print the precision of classes 0 and 1 for (prediction, label) pairs."""
    metrics = MulticlassMetrics(pred_and_label)
    # Joined with a single space, as the print statement did.
    print(' '.join(['Precision of 0', str(metrics.precision(0))]))
    print(' '.join(['Precision of 1', str(metrics.precision(1))]))
# Break every raw line into its comma-separated fields.
csvData = rawData.map(lambda line: line.split(","))

# Turn the field lists into LabeledPoints.
trainingData = csvData.map(createLabeledPoints)

# One test candidate: 10 years of experience, currently employed, 3 previous
# employers, a BS degree, not from a top-tier school, no internship. A whole
# RDD of candidates could be loaded from disk instead.
testCandidates = [array([10, 1, 3, 1, 0, 0])]
testData = sc.parallelize(testCandidates)

# Train the DecisionTree classifier; features 1, 3, 4 and 5 are declared
# categorical with the given number of categories.
model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={1: 2, 3: 4, 4: 2, 5: 2},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

# Predict for the unknown candidates. (The source data could instead be
# split into training and test sets to measure accuracy while tuning.)
predictions = model.predict(testData)
print('Hire prediction:')
results = predictions.collect()
for result in results:
    print(result)

# The decision tree itself can be printed as well.
print('Learned classification tree model:')
print(model.toDebugString())
print >> f1, string.decode('utf8') elif algorithm == "DecisionTree": #DecisionTree 201612 gini 10 32 1 0 # numClasses = int(sys.argv[2]) impurity = str(sys.argv[3]) maxDepth = int(sys.argv[4]) maxBins = int(sys.argv[5]) minInstancesPerNode = int(sys.argv[6]) minInfoGain = float(sys.argv[7]) model = DecisionTree.trainClassifier(trainingData, numClasses=classNumber, categoricalFeaturesInfo={ 6: 3, 7: 3, 8: 3, 9: 5, 10: 5, 11: 5 }, impurity=impurity, maxDepth=maxDepth, maxBins=maxBins) # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip( predictions) testErr = labelsAndPredictions.filter( lambda (v, p): v != p).count() / float(testData.count()) with codecs.open('results.txt', "w", "utf-8") as f1: string = 'testErr:' + str(testErr)
def test_all(self, measure_columns=None, dimension_columns=None):
    """Fit a decision tree for the target column and package the result.

    The target is ``measure_columns[0]``. Dimension columns whose number of
    unique values does not exceed the level cap become categorical features
    and the remaining measure columns become the other features. The tree is
    fitted with scikit-learn when ``self._pandas_flag`` is set and with Spark
    MLlib otherwise, then converted into the structures stored on a
    ``DecisionTreeResult``.

    Args:
        measure_columns: Sequence whose first element names the target
            column. NOTE(review): the default ``None`` cannot be subscripted,
            so callers must pass a value.
        dimension_columns: Only compared with ``None``; the value itself is
            not used in this method.

    Returns:
        The populated ``DecisionTreeResult``.
    """
    if dimension_columns is None:
        # NOTE(review): ``dimensions`` is not read again in this method.
        dimensions = self._dimension_columns
    self._target_dimension = measure_columns[0]
    dimension = self._target_dimension
    # Cap the allowed number of levels at the rounded square root of the
    # row count.
    max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
    max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
    # all_dimensions = [dim for dim in self._dimension_columns if self._dataframe_helper.get_num_unique_values(dim) <= max_num_levels]
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = [
        x for x in self._measure_columns if x != self._target_dimension
    ]
    self.transform_data_frames()
    decision_tree_result = DecisionTreeResult()
    # Number of mapped values per kept dimension; the largest one is later
    # used as maxBins (32 when no dimension was kept).
    cat_feature_info = [len(self._mapping_dict[c]) for c in all_dimensions]
    if len(cat_feature_info) > 0:
        max_length = max(cat_feature_info)
    else:
        max_length = 32
    # Keyed by position in all_dimensions, which is also the feature index
    # because the dimensions come first among the feature columns below.
    cat_feature_info = dict(enumerate(cat_feature_info))
    # print cat_feature_info
    if self._pandas_flag:
        dimension_classes = self._data_frame[dimension].nunique()
        # Column order: target, then dimensions, then measures.
        self._data_frame = self._data_frame[[dimension] + all_dimensions + all_measures]
        x = self._data_frame.drop(dimension, axis=1)
        y = self._data_frame[dimension]
        # Fill missing values with the column's first mode.
        for i in x.columns:
            x[i] = x[i].fillna(x[i].mode()[0])
        # NOTE(review): a regressor is fitted in this branch while the Spark
        # branch trains a classifier - confirm this is intended.
        model = DecisionTreeRegressor(max_depth=6)
        model = model.fit(x, y)
        output_result = self.tree_to_code(model, list(x.columns))
        output_result = list(map(lambda x: x.strip(), output_result))
        print(output_result, "output_result")
    else:
        dimension_classes = self._data_frame.select(
            dimension).distinct().count()
        # Column order: target, then dimensions, then measures.
        self._data_frame = self._data_frame[[dimension] + all_dimensions + all_measures]
        # First column is the label, the remaining columns are the features.
        data = self._data_frame.rdd.map(
            lambda x: LabeledPoint(x[0], x[1:]))
        # Split weights of [1.0, 0.0]: only trainingData is used below.
        (trainingData, testData) = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in dimension column
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=6,
            maxBins=max_length)
        output_result = model.toDebugString()
    # Convert the raw tree output into the nested tree structure.
    decision_tree = self.tree_json(output_result, self._data_frame, self._pandas_flag)
    self._new_tree = self.generate_new_tree(decision_tree)
    node_list = self.node_name_extractor(self._new_tree)
    node_list = list(self.flatten(node_list))
    # Pair every node name with the first element of its _count_list entry.
    correct_count_list = [i[0] for i in self._count_list]
    tree_dict = dict(list(zip(node_list, correct_count_list)))
    #self._new_tree = self.generate_new_tree_total(decision_tree)
    self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
    self._path_dict = self.path_dict_creator(node_list, self._new_tree)
    # self._new_tree = utils.recursiveRemoveNullNodes(self._new_tree)
    # decision_tree_result.set_params(self._new_tree, self._new_rules, self._total, self._success, self._probability)
    decision_tree_result.set_params(self._new_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability, self._path_dict)
    decision_tree_result.set_target_map(
        self._mapping_dict[self._target_dimension], self._aggr_data,
        self._important_vars)
    # self._completionStatus += self._scriptWeightDict[self._analysisName]["script"]*self._scriptStages["dtreeTrainingStart"]["weight"]/10
    # progressMessage = CommonUtils.create_progress_message_object(self._analysisName,\
    #                             "dtreeTrainingEnd",\
    #                             "info",\
    #                             self._scriptStages["dtreeTrainingEnd"]["summary"],\
    #                             self._completionStatus,\
    #                             self._completionStatus)
    # CommonUtils.save_progress_message(self._messageURL,progressMessage)
    # self._dataframe_context.update_completion_status(self._completionStatus)
    # Record the "dtreeTrainingEnd" stage through the shared progress helper.
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context,
        self._scriptWeightDict,
        self._scriptStages,
        self._analysisName,
        "dtreeTrainingEnd",
        "info",
        weightKey="script")
    # print decision_tree_result
    return decision_tree_result
def train_validate_test_rpart():
    """Train and test a decision tree on a balanced anomalous/benign sample.

    Loads the patient-procedure CSV, keeps all anomalous rows plus roughly
    50,000 sampled benign rows, splits the result 50/50 into train and test,
    trains a depth-2 decision tree and prints timing, accuracy,
    false-positive rate and false-negative rate.

    Returns:
        The trained model, or ``None`` when an exception occurred before
        training finished (the traceback is printed to stdout).
    """
    # Bound up front so the final return works even if loading fails.
    model = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")

        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(n_benign)) #anom.count() = 49542, benign.count() = 197406

        # Float division: on Python 2, 50000/n_benign truncates to 0 whenever
        # n_benign > 50000. The fraction is capped at 1.0 because sampling is
        # without replacement.
        sample_fraction = min(1.0, 50000.0 / n_benign)
        sample_from_benign = benign.sample(False, sample_fraction)
        pat_proc = anom.unionAll(sample_from_benign)
        print("pat_proc.count() = " + str(pat_proc.count())) #99,227

        all_columns = pat_proc.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway
        categorical_features = ["age_group", "gender", "income_range"]
        procedure_features = [x for x in features if (x not in categorical_features)]

        #Construct the map categoricalFeaturesInfo, which specifies which features are categorical and how many categorical values each of those features can take.
        #dict_cat_features maps each categorical feature name to a list of:
        #1) a sequential feature id, 2) the number of distinct values, 3) the sorted distinct values.
        dict_cat_features = {}
        for cat_feature_number, feature in enumerate(categorical_features):
            #collect() brings all distinct values to the driver, so use it with discretion.
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            distinct_values = [list(row.asDict().values())[0] for row in agvalues]
            # NOTE(review): encode() yields bytes on Python 3; confirm what
            # create_labeled_point expects before running there.
            distinct_values = sorted(unicode_val.encode('ascii', 'ignore') for unicode_val in distinct_values)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]

        pat_proc = pat_proc.rdd
        print("pat_proc.getNumPartitions() = " + str(pat_proc.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8

        (train, test) = pat_proc.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        #Populate the actual categoricalFeaturesInfo dictionary
        cat_features_info = dict((value[0], value[1]) for value in dict_cat_features.values())
        # NOTE(review): this range ends at feature id 1 + len(procedure_features),
        # so if procedure features occupy ids 3 .. 2 + len(procedure_features)
        # the last one is not declared binary - confirm against
        # create_labeled_point.
        procedure_features_info = dict((feature_id, 2) for feature_id in range(3, 2 + len(procedure_features)))
        cat_features_info.update(procedure_features_info)

        t0 = time()
        #Under the hood in DecisionTree.scala, RandomForest is called with numTrees = 1 and featureSubsetStrategy = "all".
        model = DecisionTree.trainClassifier(training_data, numClasses = 2, categoricalFeaturesInfo = cat_features_info,
                                             impurity = 'gini', maxDepth = 2, maxBins = 32)
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt, 3))) #63.355 seconds (5.5 times compared to standalone R). Even when maxDepth was reduced from 5 to 2, time to train was 61.942 seconds.
        print(model)

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3))) #0.014 seconds

        #RDD of (actual label, predicted label) pairs. The pair is indexed
        #because tuple-unpacking lambda parameters are a syntax error on Python 3.
        labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)
        test_accuracy = labels_and_preds.filter(lambda vp: vp[0] == vp[1]).count() / float(test_data_size)
        # Explicit float division so the rates are not truncated on Python 2.
        false_positives = labels_and_preds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count()
        actual_negatives = labels_and_preds.filter(lambda vp: vp[0] == 0).count()
        fpr = false_positives / float(actual_negatives)
        false_negatives = labels_and_preds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count()
        actual_positives = labels_and_preds.filter(lambda vp: vp[0] == 1).count()
        fnr = false_negatives / float(actual_positives)
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))
        #With maxDepth = 5, test accuracy is 0.9084, fpr is 0.1555, fnr is 0.0272.
        #With maxDepth = 2, test accuracy is 0.861, fpr is 0.2591, fnr is 0.018
        print(model.toDebugString())
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file = sys.stdout)
    return model
# Training data: drop the header row, strip double quotes, split on commas.
# The header is bound as a lambda default because the filter is lazy and
# ``header`` is reassigned below before any action runs.
header = train_rawDataWithHeader.first()
rawData = train_rawDataWithHeader.filter(lambda x, header=header: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
train_data = rData.map(lambda x: x.split(","))

#test_data=prepare_data(test)
# Test data: same preparation, with its own header bound the same way.
header = test_rawDataWithHeader.first()
rawData = test_rawDataWithHeader.filter(lambda x, header=header: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
test_data = rData.map(lambda x: x.split(","))

# Build the index of the first field over both data sets.
lines = train_data + test_data
categoriesMap = lines.map(lambda fields: fields[0]).distinct().zipWithIndex().collectAsMap()

train_RDD = train_data.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, len(r) - 1)))
# Test records are kept as (features, raw label field) pairs.
test_RDD = test_data.map(lambda r: (extract_features(r, categoriesMap, len(r) - 1), r[-1]))

model = DecisionTree.trainClassifier(train_RDD, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=14, maxBins=9)

count = 0
num = 0
positive = 0
negative = 0
truePositive = 0
trueNegative = 0
falsePositive = 0
falseNegative = 0
# collect() returns every record in one job; take(count()) needed a count
# job first and then fetched the same records.
for data in test_RDD.collect():
    num += 1
    preds = int(model.predict(data[0]))
    #print(str(preds)+' '+str(data[1]))
    if preds == int(data[1]):
        count = count + 1
    if int(data[1]) == 0:
        negative += 1
# Inspect the result of the data preprocessing
# print(labelpointRDD.first())

# Randomly split the data 3:1 (75%:25%) into a train set and a test set
(trainData, testData) = labelpointRDD.randomSplit([3, 1])
# print(testData.count())

# Keep the train set and the test set persisted to speed up execution
trainData.persist()
testData.persist()

# Values of maxDepth to try
params = [5, 10, 15, 20]
for depth in params:
    # Train the decision tree provided by Spark MLlib
    model = DecisionTree.trainClassifier(
        trainData,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity="entropy",
        maxDepth=depth,
        maxBins=15,
    )

    # Predict on testData with model.predict
    score = model.predict(testData.map(lambda p: p.features))

    # Print the predictions
    # score.foreach(print)
    print(score.collect())  # all
    # print(score.take(2))  # first two
    # print(score.count())  # number of records

    # Pair each prediction with its true label
    scoreAndLabels = score.zip(testData.map(lambda p: p.label))

    # Build the confusion matrix with MulticlassMetrics and report
    # accuracy, recall and precision
    metrics = MulticlassMetrics(scoreAndLabels)
    print(metrics.confusionMatrix())
    print("Accuracy = %s" % metrics.accuracy)
    print("Recall = %s" % metrics.recall(0))
    print("Precision = %s" % metrics.precision(0))
# --- Logistic regression ---------------------------------------------------
LR_model = LogisticRegressionWithLBFGS.train(trained_hashed)
LR_prediction_and_labels = check_hashed.map(lambda point: (LR_model.predict(point.features), point.label))
# filter() passes each (prediction, label) pair as ONE argument, so the pair
# is indexed; a two-parameter lambda raises TypeError here.
LR_correct = LR_prediction_and_labels.filter(lambda pair: pair[0] == pair[1])
LR_accuracy = LR_correct.count() / float(check_hashed.count())
print ("LR training accuracy:" + str(LR_accuracy * 100) + " %")
LR_output_dir = 'hdfs://master:9000/user/hadoop/LogisticRegression'
# NOTE(review): shutil works on the local filesystem; with an hdfs:// URI
# this call most likely removes nothing (errors are ignored) - confirm how
# stale model directories on HDFS should be cleaned up.
shutil.rmtree("hdfs://master:9000/user/hadoop/LogisticRegression/metadata", ignore_errors=True)
LR_model.save(cc, LR_output_dir)

# --- SVM ---------------------------------------------------------------------
SVM_model = SVMWithSGD.train(trained_hashed, iterations=10)
SVM_prediction_and_labels = check_hashed.map(lambda point: (SVM_model.predict(point.features), point.label))
SVM_correct = SVM_prediction_and_labels.filter(lambda pair: pair[0] == pair[1])
# The map above is lazy, so the accuracy is computed BEFORE the threshold is
# cleared; otherwise predict() returns raw margins, which do not equal the
# 0/1 labels.
SVM_accuracy = SVM_correct.count() / float(check_hashed.count())
SVM_model.clearThreshold()
print ("SVM training accuracy:" + str(SVM_accuracy * 100) + " %")
SVM_output = 'hdfs://master:9000/user/hadoop/SVM'
# NOTE(review): same local-filesystem caveat as for the LR directory above.
shutil.rmtree("hdfs://master:9000/user/hadoop/SVM/metadata", ignore_errors=True)
SVM_model.save(cc, SVM_output)

# --- Decision tree -----------------------------------------------------------
model = DecisionTree.trainClassifier(trained_hashed, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)
predictions = model.predict(check_hashed.map(lambda x: x.features))
labelsAndPredictions = check_hashed.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(check_hashed.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
model.save(cc, "hdfs:///user/hadoop/DT")
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

sc = SparkContext.getOrCreate()

# Four single-feature samples as (label, feature value) pairs.
data = [
    LabeledPoint(label, [feature])
    for label, feature in ((0.0, 0.0), (1.0, 1.0), (1.0, 2.0), (1.0, 3.0))
]

# Binary classifier, no categorical features, library defaults otherwise.
model = DecisionTree.trainClassifier(
    sc.parallelize(data),
    numClasses=2,
    categoricalFeaturesInfo={},
)
print(model)
attack = 0.0 if len(line_split) >= 9 and line_split[9] == 'title': attack = 1.0 return LabeledPoint(attack, array([float(x) for x in clean_line_split])) training_data = csv_data.map(create_labeled_point) test_data = test_csv_data.map(create_labeled_point) # Build the model t0 = time() tree_model = DecisionTree.trainClassifier( training_data, numClasses=2, categoricalFeaturesInfo={0: len(protocols)}, impurity='gini', maxDepth=4, maxBins=100) tt = time() - t0 print "Classifier trained in {} seconds".format(round(tt, 3)) predictions = tree_model.predict(test_data.map(lambda p: p.features)) labels_and_preds = test_data.map(lambda p: p.label).zip(predictions) t0 = time() test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float( test_data.count()) tt = time() - t0
# Create a test candidate, with 10 years of experience, currently employed, # 3 previous employers, a BS degree, but from a non-top-tier school where # he or she did not do an internship. You could of course load up a whole # huge RDD of test candidates from disk, too. testCandidates = [array([10, 1, 3, 1, 0, 0])] #Create RDD: testData = sc.parallelize(testCandidates) # Train our DecisionTree classifier using our data set model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={ 1: 2, 3: 4, 4: 2, 5: 2 }, impurity='gini', maxDepth=5, maxBins=32) # Now get predictions for our unknown candidates. (Note, you could separate # the source data into a training set and a test set while tuning # parameters and measure accuracy as you go!) predictions = model.predict(testData) print('Hire prediction:') #Uptil this point Spark doesn't do anything, just setting a Spark format up. results = predictions.collect() for result in results:
from pyspark import SparkContext from pyspark.mllib.tree import DecisionTree, DecisionTreeModel from pyspark.mllib.util import MLUtils import json from bson import json_util from bson.json_util import dumps if __name__ == "__main__": sc = SparkContext(appName="DecisionTreeClassification") raw_data = MLUtils.loadLibSVMFile(sc, '/home/hechem/spark-campaign-classification/test/data/sample_libsvm_data.txt') (trainingDataSet, testDataSet) = raw_data.randomSplit([0.7, 0.3]) tree = DecisionTree.trainClassifier(trainingDataSet, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=30) predictions = tree.predict(testDataSet.map(lambda x: x.features)) labelsAndPredictions = testDataSet.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testDataSet.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(tree.toDebugString()) tree_to_json = tree.toDebugString() # Parser def parse(lines): block = [] while lines : if lines[0].startswith('If'):
sc = SparkContext()
try:
    # Readable names for the two predicted classes
    result = {1.0: 'yes', 0.0: 'no'}
    # Fish-classification data set from chapter 3 of "Machine Learning in Action"
    data = [
        LabeledPoint(1, [1, 1]),
        LabeledPoint(1, [1, 1]),
        LabeledPoint(0, [1, 0]),
        LabeledPoint(0, [0, 1]),
        LabeledPoint(0, [0, 1])
    ]
    rdd = sc.parallelize(data)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('------------------------------------')
    print("%s %s" % (type(rdd), dir(rdd)))
    print(rdd.collect())
    print('------------------------------------')

    # The labels are only 0 and 1, so this is a two-class problem (the
    # original declared three classes although `result` knows only two).
    model = DecisionTree.trainClassifier(rdd, 2, {})
    # print(model)

    print('********************************************************')
    print(model.toDebugString())
    print("test [1,0]: %s" % (result[model.predict(array([1, 0]))]))
    print("test [1,1]: %s" % (result[model.predict(array([1, 1]))]))
    print("test [0,0]: %s" % (result[model.predict(array([0, 0]))]))
    print('********************************************************')
finally:
    # Release the context even when training or prediction fails.
    sc.stop()
def test_all(self, measure_columns=None, dimension_columns=None):
    """Fit an MLlib decision tree that predicts the first dimension column.

    Every dimension column is re-encoded as integer level indices (the
    index -> level mapping is stored in ``self._mapping_dict``), the frame is
    reordered to ``[target] + other dimensions + measures`` and a depth-3
    gini tree is trained on all rows.

    Args:
        measure_columns: accepted for interface compatibility; not used here.
        dimension_columns: sequence whose first element names the target
            column. It is indexed unconditionally, so it must be a non-empty
            sequence (the default ``None`` is not usable).

    Returns:
        The ``DecisionTreeResult`` filled through ``set_params``.
    """
    dimension = dimension_columns[0]
    all_dimensions = self._dimension_columns
    all_measures = self._measure_columns
    columns_without_dimension = [x for x in all_dimensions if x != dimension]
    decision_tree_result = DecisionTreeResult()

    # index -> level name for every dimension column (levels are stringified,
    # so a null level becomes the string 'None').
    mapping_dict = {}
    for column in all_dimensions:
        mapping_dict[column] = dict(
            enumerate(
                self._data_frame.select(column).distinct().rdd.map(
                    lambda x: str(x[0])).collect()))

    # Convert the Spark dataframe to pandas for the re-encoding and then back
    # to a Spark dataframe.
    pandasDataFrame = self._data_frame.toPandas()
    for key in mapping_dict:
        reverseMap = {v: k for k, v in mapping_dict[key].items()}
        # One pass: nulls are looked up under 'None', everything else as is.
        pandasDataFrame[key] = pandasDataFrame[key].apply(
            lambda x: reverseMap['None' if x is None else x])
    self._data_frame = self._spark.createDataFrame(pandasDataFrame)
    self._mapping_dict = mapping_dict

    # Number of levels of each non-target dimension, in feature order.
    level_counts = [
        self._data_frame.select(c).distinct().count()
        for c in columns_without_dimension
    ]
    max_length = max(level_counts) if level_counts else 32
    cat_feature_info = dict(enumerate(level_counts))
    dimension_classes = self._data_frame.select(dimension).distinct().count()

    # Column 0 becomes the label, the remaining columns the features.
    self._data_frame = self._data_frame[[dimension] +
                                        columns_without_dimension +
                                        all_measures]
    data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
    # Weights [1.0, 0.0]: all rows go to the training split.
    trainingData, _ = data.randomSplit([1.0, 0.0])

    # TO DO : set maxBins at least equal to the max level of categories in
    # dimension column
    model = DecisionTree.trainClassifier(
        trainingData,
        numClasses=dimension_classes,
        categoricalFeaturesInfo=cat_feature_info,
        impurity='gini',
        maxDepth=3,
        maxBins=max_length)
    output_result = model.toDebugString()
    decision_tree = self.tree_json(output_result, self._data_frame)
    self.generate_probabilities(decision_tree, dimension)
    # NOTE(review): _new_rules/_total/_success/_probability are not assigned in
    # this method; presumably generate_probabilities sets them -- confirm.
    decision_tree_result.set_params(decision_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability)
    return decision_tree_result
# Append Labels appendColumn(ensemble_test, rf_test_predict_label) appendColumn(ensemble_train, rf_train_predict_label) # Decision Trees # C13 - C21 # Build the Model max_depth = [5, 10, 15, 20] for i in range(0, len(max_depth), 1): m_depth = max_depth[i] # Build the Model model = DecisionTree.trainClassifier(train_data, 10, {}, impurity='gini', maxDepth=m_depth) rf_train_predict_label = [] rf_test_predict_label = [] # Predict Labels for j in range(0, len(test_features), 1): p_l = model.predict(test_features[j]) rf_test_predict_label.extend([p_l]) for j in range(0, len(train_features), 1): p_l = model.predict(train_features[j]) rf_train_predict_label.extend([p_l]) # Append Labels
if fields[6] == "Y": hired = 1 else: hired = 0 return LabeledPoint(hired, [years_of_exp,employed,previousEmployers,education_level,top_tier_school,internship]) path = '/home/sejal/Documents/datascience/dataset/data/emp/candidates_hired_past.csv' r1 = sc.textFile(path) r2 = r1.map(lambda entry: entry.split(',')) training_data = r2.map(prepare_data_for_DT) test_data = [10,1,2,2,1,0] model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={1:2, 3:4,4:2,5:2}) predictions = model.predict(test_data) print("Hire OR No-Hire") print (predictions) print (model.toDebugString()) # results = predictions.collect() # for result in results: # print result
# exec(open("./doweathclass_dectree.py").read())
# ---------------- now try decision tree ------------
from pyspark.mllib.tree import DecisionTree

# Binary classifier; the empty dict is where a categorical-feature map would
# go. For regression there is no numClasses and trainRegressor is used instead.
dt_model = DecisionTree.trainClassifier(
    datax_rdd,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='entropy',
    maxDepth=3,
    maxBins=32,
    minInstancesPerNode=2,
)

print(dt_model.toDebugString())
# results in this:
# DecisionTreeModel classifier of depth 3 with 9 nodes
#   If (feature 1 <= 0.0)
#    If (feature 4 <= 80.0)
#     If (feature 3 <= 68.0)
#      Predict: 0.0
#     Else (feature 3 > 68.0)
#      Predict: 1.0
#    Else (feature 4 > 80.0)
#     If (feature 0 <= 0.0)
#      Predict: 0.0
#     Else (feature 0 > 0.0)
#      Predict: 0.0
#   Else (feature 1 > 0.0)
#    Predict: 1.0
def main():
    """Train, save and evaluate a decision tree on the reduced Bosch numeric data.

    Steps: read the column names, re-read the CSV with an explicit schema,
    drop the columns named in ``column_refine_list.csv``, train a depth-4 gini
    tree on a 70/30 split, print accuracy and MCC, and write the reduced
    dataframe back out as CSV.

    NOTE(review): ``sqlContext`` and ``sc`` are not created here and must
    exist at module level; the ``spark`` session built below is otherwise
    unused.
    """
    spark = SparkSession\
        .builder\
        .appName("PythonSQL")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    # Import a raw DataFrame (DF) only to get the column names. The original
    # passed a misspelled 'inferchema' option; it is dropped because only the
    # column names are used, so types never need to be inferred.
    raw_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz")
    features = raw_df.columns

    # Manually specify the correct datatypes for each column and import new DF:
    # floats everywhere except the first and last column, which are integers.
    fields = [
        StructField(field_name, FloatType(), True) for field_name in features
    ]
    fields[0].dataType = IntegerType()
    fields[-1].dataType = IntegerType()
    customSchema = StructType(fields)
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='true') \
        .load("/data/ganesh/BigData/Bosch/Source/train_numeric.csv.gz", schema=customSchema)
    # NOTE(review): the original called df.na.fill(NaN) here and discarded the
    # result; DataFrames are immutable, so the call had no effect and is gone.

    # Prepare features for computation: remove the columns from a list
    # precompiled on a correlation criterion.
    column_to_go = []  # stays empty if the CSV has no rows
    with open('column_refine_list.csv', 'r') as f:
        for item in csv.reader(f, delimiter=','):
            # Each row replaces the previous one: only the LAST row is used.
            column_to_go = item[:]
    print("Total numer of features to be removed: %d" % (len(column_to_go)))
    print("\n")
    for item in column_to_go:
        df = df.drop(item)
    print("Final number of features is: %d" % (len(df.columns[1:-1])))

    # Decision Tree model training
    training_points = labelData(df)  # csv_numeric.map(ReducedlabelData)
    training_data, test_data = training_points.randomSplit([0.7, 0.3])
    print(training_data.first())
    t0 = time()
    tree_model = DecisionTree.trainClassifier(training_data,
                                              numClasses=2,
                                              categoricalFeaturesInfo={},
                                              impurity='gini',
                                              maxDepth=4,
                                              maxBins=100)
    tt = time() - t0
    # Save the model for future use!
    tree_model.save(sc, "DTmodel_model_reduced_222red")
    print("Model trained in : %.4f Sec" % (tt))
    print(tree_model.toDebugString())

    # Making predictions on the test set
    t0 = time()
    labels_and_preds = getLabelsPredictions(tree_model, test_data)
    # Index the (label, prediction) pair instead of unpacking it in the lambda
    # signature, which is a syntax error on Python 3.
    test_accuracy = 100 * labels_and_preds.filter(
        lambda vp: vp[0] == vp[1]).count() / float(test_data.count())
    print("Test accuracy is : %.4f " % (test_accuracy))
    printMCC(labels_and_preds)
    tt = time() - t0
    print("Predictions and metrics computed in : %.4f Sec" % (tt))

    # Write the reduced dataframe. customSchema is reused: the original rebuilt
    # an identical schema from the same raw_df.columns.
    # NOTE(review): that schema describes the full column set, not the reduced
    # df; how the writer treats the `schema` keyword is not visible here.
    df.select(
        df.columns
    ).write.format('com.databricks.spark.csv').options(header='true').save(
        '/data/ganesh/BigData/Bosch/Source/feature_reduction/numeric/feature_reduction_V2/df_739.csv',
        schema=customSchema)
    sc.stop()
7: len(destination_mapping.value) } splits = text_rdd.randomSplit([0.7, 0.3]) (training_rdd, test_rdd) = (splits[0], splits[1]) training_data = training_rdd.map( Utils.parse_flight).map(lambda rdd: Utils.create_labeled_point( rdd, carrier_mapping.value, origin_mapping.value, destination_mapping.value)) classes_count = 2 impurity = "gini" max_depth = 9 max_bins = 7000 model = DecisionTree.trainClassifier(training_data, classes_count, categorical_features_info, impurity, max_depth, max_bins) Utils.save_model_to_grid(model, "DecisionTreeFlightModel", sc) save_mapping(carrier_mapping.value, "CarrierMap", sqlc) save_mapping(origin_mapping.value, "OriginMap", sqlc) save_mapping(destination_mapping.value, "DestinationMap", sqlc) # Test model test_data = test_rdd.map(lambda r: Utils.parse_flight(r)) \ .map(lambda rdd: Utils.create_labeled_point(rdd, carrier_mapping.value, origin_mapping.value, destination_mapping.value)) predictions = model.predict(test_data.map(lambda x: x.features)) labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
trainingData_rdd.cache() ############################################################################### if 'Tree_mllib' in model_list: t0 = datetime.datetime.now() # Only change parameters here tree_mllib_par_dict = {} tree_mllib_par_dict['numClasses'] = 2 tree_mllib_par_dict['impurity'] = 'gini' tree_mllib_par_dict['maxDepth'] = 10 model_mllib_tree = DecisionTree.trainClassifier( trainingData_rdd, numClasses=tree_mllib_par_dict['numClasses'], categoricalFeaturesInfo={}, impurity=tree_mllib_par_dict['impurity'], maxDepth=tree_mllib_par_dict['maxDepth']) mllib_model_accuracy('Tree', model_mllib_tree, trainingData_rdd, testData_rdd) modelparameter_dict['Tree_mllib'] = tree_mllib_par_dict runtime_write('Tree_mllib', t0) ############################################################################### # Random forest if 'RF_mllib' in model_list: t0 = datetime.datetime.now()
clean_line_split[3] = len(flags) # convert label to binary label attack = 1.0 if len(line_split) >= 42 and line_split[41]=='normal.': attack = 0.0 return LabeledPoint(attack, array([float(x) for x in clean_line_split])) training_data = csv_data.map(create_labeled_point) test_data = test_csv_data.map(create_labeled_point) # Build the model t0 = time() tree_model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={1: len(protocols), 2: len(services), 3: len(flags)}, impurity='gini', maxDepth=4, maxBins=100) tt = time() - t0 print "Classifier trained in {} seconds".format(round(tt,3)) predictions = tree_model.predict(test_data.map(lambda p: p.features)) labels_and_preds = test_data.map(lambda p: p.label).zip(predictions) t0 = time() test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count()) tt = time() - t0 print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt,3), round(test_accuracy,4)) print "Learned classification tree model:"
sc = SparkContext.getOrCreate()
data = MLUtils.loadLibSVMFile(sc, 'data/dataLibSVM.txt')
print(data)

# Create the training and test sets: a 70/30 split with a fixed random seed
# so the results are reproducible.
(trainingSet, testSet) = data.randomSplit([0.7, 0.3], seed=7)

##################
# DECISION TREES #
##################
fitDT = DecisionTree.trainClassifier(trainingSet,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=3,
                                     maxBins=32)
print(fitDT.toDebugString())

predictionsDT = fitDT.predict(testSet.map(lambda x: x.features))
labelsAndPredictionsDT = testSet.map(lambda lp: lp.label).zip(predictionsDT)

# Test error rate: share of test points whose prediction differs from the
# label. The pair is indexed because a tuple-unpacking lambda signature is a
# syntax error on Python 3, while this script already uses print().
testErrDT = labelsAndPredictionsDT.filter(
    lambda vp: vp[0] != vp[1]).count() / float(testSet.count())
print('Test Error = {0}'.format(testErrDT))
def main():
    """Compare four MLlib classifiers on the pickled 'BadOrGood' data set.

    Loads ``/pickleData``, standardises the features, makes a seeded 70/30
    split, trains logistic regression (LBFGS and SGD), a decision tree and a
    random forest, and prints F1 and accuracy for each model plus the weights
    of the two logistic-regression models.
    """
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            # The Spark key is plural; the singular 'spark.executor.instance'
            # the original set is not a recognised setting.
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    try:
        hc = HiveContext(sc)  # only needed by the commented-out fetch below

        # fetch data
        # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
        # fetchDataToFile(hc, filepath)

        # load data
        # AllDataRawrdd = sc.pickleFile(filepath) \
        #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
        #     .repartition(10)
        AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

        # standardizer for train and test data
        model = StandardScaler(True, True) \
            .fit(AllDataRawrdd.map(lambda row: Vectors.dense(row['feature'])))
        labels = AllDataRawrdd.map(lambda row: row['label'])
        featureTransformed = model.transform(
            AllDataRawrdd.map(lambda row: row['feature']))
        # Re-attach each label to its standardised feature vector.
        AllDataRawrdd = labels \
            .zip(featureTransformed) \
            .map(lambda pair: {'label': pair[0], 'feature': pair[1]})

        # sampling
        trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(
            weights=[0.7, 0.3], seed=100)
        trainDatardd = trainDataRawrdd.map(
            lambda row: LabeledPoint(row['label'], row['feature'])).persist()
        testDatardd = testDataRawrdd.map(
            lambda row: {'label': row['label'],
                         'feature': list(row['feature'])}).persist()

        # prediction & test
        lrmLBFGS = LogisticRegressionWithLBFGS.train(
            trainDatardd, iterations=3000, regParam=0.01, regType="l1")
        resultrdd = test(lrmLBFGS, testDatardd)
        lrmLBFGSFone = fone(resultrdd)
        lrmLBFGSac = accuracy(resultrdd)

        lrmSGD = LogisticRegressionWithSGD.train(
            trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
        resultrdd = test(lrmSGD, testDatardd)
        lrmSGDFone = fone(resultrdd)
        lrmSGDac = accuracy(resultrdd)

        dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
        resultrdd = test(dt, testDatardd)
        dtFone = fone(resultrdd)
        dtac = accuracy(resultrdd)

        rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
        resultrdd = test(rf, testDatardd)
        rfFone = fone(resultrdd)
        rfac = accuracy(resultrdd)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
        print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
        print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
        print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

        print(lrmLBFGS.weights)
        print(lrmSGD.weights)
    finally:
        # Always release the cluster resources, even when a stage fails.
        sc.stop()
def test_classification(self):
    """Train each MLlib classifier on four points and check its predictions.

    Points 0 and 2 carry label 0.0, points 1 and 3 label 1.0. Every model
    must predict <= 0 for the former and > 0 for the latter. The tree-based
    models are additionally saved, reloaded and compared by debug string.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_separates(model):
        # Same four assertions, in the same order, for every model.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def assert_round_trips(model, model_cls, name):
        # Save under temp_dir/<name>, reload and compare the tree structure.
        model_dir = os.path.join(temp_dir, name)
        model.save(self.sc, model_dir)
        reloaded = model_cls.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    try:
        assert_separates(LogisticRegressionWithSGD.train(rdd, iterations=10))
        assert_separates(SVMWithSGD.train(rdd, iterations=10))
        assert_separates(NaiveBayes.train(rdd))

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        assert_separates(dt_model)
        assert_round_trips(dt_model, DecisionTreeModel, "dt")

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        assert_separates(rf_model)
        assert_round_trips(rf_model, RandomForestModel, "rf")

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        assert_separates(gbt_model)
        assert_round_trips(gbt_model, GradientBoostedTreesModel, "gbt")
    finally:
        # Remove the temporary directory even when an assertion above fails;
        # a failure to delete it is deliberately ignored (best effort).
        try:
            rmtree(temp_dir)
        except OSError:
            pass
labelsAndPreds = parsedData.map(lambda p: (p.label, SVMmodel.predict(p.features)))
# The (label, prediction) pair is indexed: a tuple-unpacking lambda signature
# is a syntax error on Python 3, while this script already uses print().
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))  ## 0.555395278766

############################ Decision TREE ##############################
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils


def parsePoint(line):
    """Parse one comma-separated line of floats into a LabeledPoint.

    The last value is the label and the first nine values are the features.
    """
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[-1], values[0:9])


data = sc.textFile("/Users/mac/Desktop/USF/MSAnalytics/Spring1/ML2/ML Project/plays.csv")
# Drop the header row (every line equal to the first line is removed).
header = data.first()
data = data.filter(lambda x: x != header)
parsedData = data.map(parsePoint)

model = DecisionTree.trainClassifier(parsedData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=30,
                                     maxBins=100)

# Evaluate model on training instances and compute training error
predictions = model.predict(parsedData.map(lambda x: x.features))
labelsAndPredictions = parsedData.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())  # 0.09
print('Training Error = ' + str(trainErr))
print('Learned classification tree model:')
print(model)
def test_all(self, measure_columns=None, dimension_columns=None):
    """Build a decision tree for the first dimension column and wrap the result.

    With ``self._pandas_flag`` set, the other dimensions are one-hot encoded
    and a scikit-learn ``DecisionTreeClassifier`` is fitted. Otherwise all
    dimensions are string-indexed and a Spark MLlib tree is trained with every
    feature declared categorical. The textual tree is converted by the
    ``tree_json`` / ``generate_new_tree`` / ``wrap_tree`` helpers and stored,
    with rule statistics, in a ``DecisionTreeResult``. Progress is reported at
    the end.

    Args:
        measure_columns: accepted for interface compatibility; not used here.
        dimension_columns: sequence whose first element names the target
            column. It is indexed unconditionally, so it must be non-empty.

    Returns:
        The populated ``DecisionTreeResult``.

    NOTE(review): the nesting was reconstructed from whitespace-collapsed
    source; in particular, ``all_dimensions.append(dimension)`` is assumed to
    belong to the pandas-only branch -- confirm against the original file.
    """
    self._target_dimension = dimension_columns[0]
    dimension = self._target_dimension

    ##### Look into it for Issue 947 #################
    max_num_levels = GLOBALSETTINGS.DTREE_OTHER_DIMENSION_MAX_LEVEL
    # max_num_levels = min(max_num_levels, round(self._dataframe_helper.get_num_rows()**0.5))
    # Only dimensions with at most max_num_levels distinct values are kept.
    all_dimensions = [
        dim for dim in self._dimension_columns
        if self._metaParser.get_num_unique_values(dim) <= max_num_levels
    ]
    all_measures = self._measure_columns
    if self._pandas_flag:
        self._data_frame = self._data_frame[all_dimensions + all_measures]
    cat_feature_info = []
    columns_without_dimension = [
        x for x in all_dimensions if x != dimension
    ]
    mapping_dict = {}
    decision_tree_result = DecisionTreeResult()
    decision_tree_result.set_freq_distribution(
        self._metaParser.get_unique_level_dict(self._target_dimension),
        self._important_vars)

    if self._pandas_flag:
        try:
            all_dimensions.remove(dimension)
        except ValueError:
            # The target was filtered out above; there is nothing to remove.
            pass
        actual_cols = list(self._data_frame.columns)
        print(actual_cols)
        self._data_frame = pd.get_dummies(self._data_frame,
                                          columns=all_dimensions)
        after_dummy_cols = list(self._data_frame.columns)

        def Diff(li1, li2):
            # Elements present in exactly one of the two lists.
            return (list(
                list(set(li1) - set(li2)) + list(set(li2) - set(li1))))

        decision_tree_result.dummy_cols = [
            Diff(after_dummy_cols, Diff(actual_cols, all_dimensions)),
            all_dimensions
        ]
        # This has been done for a scoring error. dummy_cols[1] is this same
        # list object, so it receives the target column as well.
        all_dimensions.append(dimension)

    if self._pandas_flag:
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, [dimension], self._pandas_flag)
        print(self._data_frame.head(1))
    else:
        self._data_frame, mapping_dict = MLUtils.add_string_index(
            self._data_frame, all_dimensions, self._pandas_flag)
        # show() prints the row itself and returns None, so wrapping it in
        # print() only added a stray "None" line.
        self._data_frame.show(1)

    # standard_measure_index = {0.0:'Low',1.0:'Medium',2.0:'High'}
    standard_measure_index = {
        0.0: 'Low',
        1.0: 'Below Average',
        2.0: 'Average',
        3.0: 'Above Average',
        4.0: 'High'
    }
    for measure in all_measures:
        mapping_dict[measure] = standard_measure_index
    # Strip commas from level names, remembering the original spelling in
    # self._alias_dict (comma-free name -> original name).
    for k, v in list(mapping_dict.items()):
        temp = {}
        for k1, v1 in list(v.items()):
            self._alias_dict[v1.replace(",", "")] = v1
            temp[k1] = v1.replace(",", "")
        mapping_dict[k] = temp
    self._mapping_dict = mapping_dict

    if not self._pandas_flag:
        # Spark path: one category count per feature. (The original also
        # tested _pandas_flag inside this branch, which could never be true.)
        for c in columns_without_dimension:
            cat_feature_info.append(
                self._data_frame.select(c).distinct().count())
        # Measures are declared as 5-level categorical features, matching the
        # five entries of standard_measure_index.
        for c in all_measures:
            cat_feature_info.append(5)
        columns_without_dimension = columns_without_dimension + all_measures
        all_measures = []
        if len(cat_feature_info) > 0:
            max_length = max(cat_feature_info)
        else:
            max_length = 32
    else:
        decision_tree_result.mappingdict = mapping_dict[dimension]
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))

    if self._pandas_flag:
        dimension_classes = len(self._data_frame[dimension].unique())
    else:
        dimension_classes = self._data_frame.select(
            dimension).distinct().count()
        # Column 0 becomes the label, the remaining columns the features.
        self._data_frame = self._data_frame[[dimension] +
                                            columns_without_dimension +
                                            all_measures]
    print("=" * 200)
    # print self._data_frame.rdd.first()
    print("numClasses", dimension_classes)
    print("maxDepth", self._maxDepth)
    decision_tree_result._maxDepth = self._maxDepth
    print("maxBins", max_length)
    print("=" * 200)

    if self._pandas_flag:
        # Raw string: '\W' in a normal string literal is an invalid escape.
        self._data_frame.columns = [
            re.sub(r'\W+', '_', col.strip())
            for col in self._data_frame.columns
        ]
        x = self._data_frame.drop(dimension, axis=1)
        y = self._data_frame[dimension]
        # tle = LabelEncoder()
        # y = tle.fit_transform(y)
        # Fill missing values with each column's most frequent value.
        for i in x.columns:
            x[i] = x[i].fillna(x[i].mode()[0])
        model = DecisionTreeClassifier(criterion='gini',
                                       max_depth=self._maxDepth,
                                       random_state=42)
        model = model.fit(x, y)
        output_result = self.tree_to_code(model, list(x.columns))
        output_result = [rule.strip() for rule in output_result]
    else:
        data = self._data_frame.rdd.map(
            lambda x: LabeledPoint(x[0], x[1:]))
        # Weights [1.0, 0.0]: all rows go to the training split.
        trainingData, _ = data.randomSplit([1.0, 0.0])
        # TO DO : set maxBins at least equal to the max level of categories in
        # dimension column.
        # categoricalFeaturesInfo must stay cat_feature_info: with {} every
        # feature is treated as continuous, which produces wrong prediction
        # rules (e.g. a "yes"/"no" column split at the float value 0.5).
        model = DecisionTree.trainClassifier(
            trainingData,
            numClasses=dimension_classes,
            categoricalFeaturesInfo=cat_feature_info,
            impurity='gini',
            maxDepth=self._maxDepth,
            maxBins=max_length)
        output_result = model.toDebugString()

    decision_tree = self.tree_json(output_result, self._data_frame,
                                   self._pandas_flag)
    self._new_tree = self.generate_new_tree(decision_tree)
    node_list = self.node_name_extractor(self._new_tree)
    node_list = list(self.flatten(node_list))
    # NOTE(review): _count_list is not assigned in this method; presumably one
    # of the helpers above fills it -- confirm.
    correct_count_list = [i[0] for i in self._count_list]
    tree_dict = dict(list(zip(node_list, correct_count_list)))
    self._new_tree = self.wrap_tree(self._new_tree, tree_dict)
    self._path_dict = self.path_dict_creator(node_list, self._new_tree)
    print("===" * 40)
    decision_tree_result.set_params(self._new_tree, self._new_rules,
                                    self._total, self._success,
                                    self._probability, self._path_dict)

    # Report progress for the "treegeneration" stage.
    self._completionStatus += old_div(
        self._scriptWeightDict[self._analysisName]["script"] *
        self._scriptStages["treegeneration"]["weight"], 10)
    progressMessage = CommonUtils.create_progress_message_object(
        self._analysisName,
        "treegeneration",
        "info",
        self._scriptStages["treegeneration"]["summary"],
        self._completionStatus,
        self._completionStatus)
    CommonUtils.save_progress_message(self._messageURL,
                                      progressMessage,
                                      ignore=self._ignoreMsg)
    self._dataframe_context.update_completion_status(
        self._completionStatus)
    return decision_tree_result
print(test_data[0].features)
print(dir(test_data[0]))

# Only the feature vectors are needed for prediction.
new_test_data = [l_point.features for l_point in test_data]
test = sc.parallelize(new_test_data)

# Wall-clock timer. process_time() counts only the CPU time of this Python
# process, which leaves out work done outside it and time spent waiting.
start_time = time.perf_counter()
print("Creating tree")
# dt = SparkDecisionTreeClassifier(featuresCol = 'features',
#                                  labelCol = 'Target',
#                                  maxMemoryInMB=2048,
#                                  minInstancesPerNode=2)
print("Fitting")
dtModel = DecisionTree.trainClassifier(train,
                                       numClasses=2,
                                       categoricalFeaturesInfo={},
                                       minInstancesPerNode=2)
# dtModel = dt.fit(train)
print("Predicting")
predictions = dtModel.predict(test).collect()
# predictions = dtModel.transform(test)
print("Showing Predictions")
# predictions.printSchema()
# predictions.select('rawPrediction', 'prediction', 'probability').show(10)
print(predictions)
end_time = time.perf_counter()
delta_time = end_time - start_time
# Elapsed time in minutes
print(delta_time / 60.0)
sc = SparkContext(appName="PythonDT")

# Load data: the bundled sample file unless exactly one path argument is given.
dataPath = sys.argv[1] if len(sys.argv) == 2 else 'data/mllib/sample_libsvm_data.txt'
if not os.path.isfile(dataPath):
    sc.stop()
    usage()
points = MLUtils.loadLibSVMFile(sc, dataPath)

# Re-index class labels if needed.
reindexedData, origToNewLabels = reindexClassLabels(points)
numClasses = len(origToNewLabels)

# Train a classifier.
categoricalFeaturesInfo = {}  # no categorical features
model = DecisionTree.trainClassifier(
    reindexedData,
    numClasses=numClasses,
    categoricalFeaturesInfo=categoricalFeaturesInfo,
)

# Print learned tree and stats.
print("Trained DecisionTree for classification:")
print("  Model numNodes: %d" % model.numNodes())
print("  Model depth: %d" % model.depth())
print("  Training accuracy: %g" % getAccuracy(model, reindexedData))
# Small trees are printed in full, larger ones only as a summary.
print(model.toDebugString() if model.numNodes() < 20 else model)

sc.stop()
# elif line_split[41]=='rootkit.': # attack = 19.0 # elif line_split[41]=='perl.': # attack = 20.0 # elif line_split[41]=='loadmodule.': # attack = 21.0 return LabeledPoint(attack, array([float(x) for x in clean_line_split])) training_data = csv_data.map(create_labeled_point) test_data = test_csv_data.map(create_labeled_point) t0 = time() print("Classifier training started at: ".format(round(t0, 3))) tree_model = DecisionTree.trainClassifier(training_data, numClasses=5, categoricalFeaturesInfo={ 1: len(protocols), 2: len(services), 3: len(flags) }, impurity='gini', maxDepth=4, maxBins=100) tree_model.save(sc, "/home/ubuntu/project_src/probe_portsweep_model") tt = time() - t0 print("Classifier trained in {} seconds".format(round(tt, 3))) predictions = tree_model.predict(test_data.map(lambda p: p.features)) labels_and_preds = test_data.map(lambda p: p.label).zip(predictions) t0 = time() test_accuracy = labels_and_preds.filter( lambda vp: vp[0] == vp[1]).count() / float(test_data.count()) tt = time() - t0 #print(labels_and_preds.collect()) print(
from sklearn.cross_validation import LeaveOneOut from sklearn.cross_validation import KFold # Kfold if __name__ == "__main__": sc = SparkContext('local',appName="Prediction") import fileinput data_y1, data_y2 = [], [] for line in fileinput.input("data/feature_extracted_class3.txt"): data_y1.append(LabeledPoint(float(1 if int(line.split("\t")[2])!=0 else 0), [float(i) for i in line.split("\t")[3:]])) data_y2.append(LabeledPoint(int(line.split("\t")[2]), [float(i) for i in line.split("\t")[3:]])) total, right, mse = 0, 0, [] for t in xrange(10): kf = KFold(32*40, n_folds=10) for train, test in kf: data_train_y1, data_train_y2 = [], [] for i in train: data_train_y1.append(data_y1[i]) data_train_y2.append(data_y2[i]) clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=100) clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2), categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=100) for i in test: data_test_y1, data_test_y2 = data_y1[i], data_y2[i] r1 = clf1.predict(data_test_y1.features) r2 = clf2.predict(data_test_y2.features) if r1 == data_test_y1.label: right += 1 mse.append(abs(r2-data_test_y2.label)) total += 1 print float(right)/total, sum(mse)/len(mse)
conf = SparkConf().setAppName(appName).setMaster("local[2]") #at least 2 sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) # separate the classification label and the actual data def parsePoint(line): values = [float(x) for x in line.split(',')] return LabeledPoint(values[0], values[1:]) # training the model data = sc.textFile(learning_data_file) parsedData = data.map(parsePoint) model = (DecisionTree.trainClassifier(parsedData, numClasses=2, categoricalFeaturesInfo={2:9}, impurity='gini', maxDepth=30)) """ model = (RandomForest.trainClassifier(parsedData, numClassesForClassification=2, numTrees=6, categoricalFeaturesInfo={2:10}, impurity='gini', maxDepth=30)) """ print "====================== model trained ======================" # streaming and parsing text lines = ssc.socketTextStream(HOST, QUERY_PORT) vectors = lines.flatMap(lambda x:x.split(',')).map(lambda l:float(l))
""" values = [float(s) for s in line.strip().split(',')] if values[0] == -1: # Convert -1 labels to 0 for MLlib values[0] = 0 elif values[0] > 0: values[0] = 1 return LabeledPoint(values[0], values[1:]) parsed_data = points.map(parsePoint) print 'After parsing, number of training lines: %s' % parsed_data.count() parsed_data.take(5) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = parsed_data.randomSplit([0.7, 0.3]) # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=3, maxBins=32) predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('Learned classification tree model:') print(model.toDebugString()) # save the model to model.save(sc, "decisiontree")
def test_classification(self):
    """Train each MLlib classifier on a four-point dataset and verify it.

    Every model must predict a value <= 0 for the two points labelled 0.0
    and a value > 0 for the two points labelled 1.0.  The tree-based models
    (decision tree, random forest, gradient-boosted trees) are additionally
    saved to a temporary directory, reloaded, and compared with the
    original through ``toDebugString()``.

    The temporary directory is removed even when an assertion fails.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_predictions(model):
        # data[0] and data[2] are labelled 0.0; data[1] and data[3] are 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_save_load(model, model_class, model_dir):
        # A reloaded model must describe exactly the same tree(s).
        model.save(self.sc, model_dir)
        same_model = model_class.load(self.sc, model_dir)
        self.assertEqual(same_model.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        check_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        check_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        check_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        check_predictions(dt_model)
        check_save_load(dt_model, DecisionTreeModel, os.path.join(temp_dir, "dt"))

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10, maxBins=4, seed=1)
        check_predictions(rf_model)
        check_save_load(rf_model, RandomForestModel, os.path.join(temp_dir, "rf"))

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        check_predictions(gbt_model)
        check_save_load(gbt_model, GradientBoostedTreesModel, os.path.join(temp_dir, "gbt"))
    finally:
        # Best-effort cleanup: a failure to delete must not mask the test
        # outcome, so OSError is deliberately ignored.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
# Load the source collection as a DataFrame through the MongoDB connector.
data = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

# 70% of the rows go to training, 30% to testing.
trainingData, testData = data.randomSplit([0.7, 0.3])

# Map the rows of each split through rdd2label_point.
trainingDataRDD, testDataRDD = (
    split.rdd.map(rdd2label_point) for split in (trainingData, testData)
)

# Categorical features: feature index -> number of categories.
category = {1: 2, 4: 2, 6: 2, 7: 4, 10: 2, 11: 2, 12: 2}

# Settings shared by both tree-based models.
shared_tree_params = dict(
    numClasses=2,
    categoricalFeaturesInfo=category,
    impurity='gini',
    maxDepth=5,
    maxBins=12,
)

# Single decision tree.
decision_tree_model = DecisionTree.trainClassifier(trainingDataRDD, **shared_tree_params)

# Random forest of 7 trees with the "auto" feature-subset strategy.
random_forest_model = RandomForest.trainClassifier(
    trainingDataRDD,
    numTrees=7,
    featureSubsetStrategy="auto",
    **shared_tree_params
)

# Feature vectors of the test split, plus the number of test rows.
testDataFeatureRDD = testDataRDD.map(lambda point: point.features)
testDataCount = testData.count()

# Predict with both models and bring the results back to the driver.
decision_tree_prediction = decision_tree_model.predict(testDataFeatureRDD).collect()
random_forest_prediction = random_forest_model.predict(testDataFeatureRDD).collect()

# metric
vector) #se precisar de feature do Feature Selection data = pass2libsvm(reduced, sc.parallelize(classes)) #para a (5-tupla deveria ser algo como ) data=pass2libsvm(vector) (trainingData, testData) = data.randomSplit([0.7, 0.3]) print 'data devided' #trainingData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-16.out',15)) #testData = CorrelationFeature(sc.textFile('hdfs://master:9000/user/app/classes-25.out',15)) # Train a DecisionTree model. # Empty categoricalFeaturesInfo indicates all features are continuous. model = DecisionTree.trainClassifier(trainingData, numberClasses, {}) #, maxDepth=5, maxBins=32) # let lrm be a LogisticRegression Model #model.save(sc, "hdfs://master:9000/user/app/model-"+str(sys.argv[2]+".model")) print 'model done' #to load the model #sameModel = DecisionTreeModel.load(sc, "lrm_model.model") # Evaluate model on test instances and compute test error predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) metrics = MulticlassMetrics(labelsAndPredictions)
# Load the dataset.
sc = SparkContext(appName="trees3")
# Raw string: the same value as before (a backslash followed by "c"), but
# no longer dependent on "\c" being an unrecognised escape sequence.
text = sc.textFile(r"home\cbank.data")

# Arity of every categorical feature, keyed by feature index.
categorical_arity = {1: 12, 2: 3, 3: 4, 4: 2, 6: 2, 7: 2, 8: 3, 10: 12, 15: 4}

# Each record has 16 attributes followed by the class label.
column_count = 17

# Columns whose raw codes are decremented by one: every categorical feature
# plus the class label in the last column.  MLlib documents categories as
# indexed from 0 and labels as 0 .. numClasses-1.
shifted_columns = frozenset(categorical_arity) | frozenset([column_count - 1])


def to_zero_based(row):
    """Return *row* with one subtracted from every column in shifted_columns.

    Raises ValueError when the row does not hold exactly `column_count`
    values, so malformed records are still rejected.
    """
    if len(row) != column_count:
        raise ValueError(
            "expected %d columns, got %d" % (column_count, len(row)))
    return [value - 1 if index in shifted_columns else value
            for index, value in enumerate(row)]


# Parse: split on tabs, strip the "A" prefix from every code, convert to
# int, shift the selected columns, and wrap the row as (label, features).
data = (text.map(lambda l: l.split('\t'))
        .map(lambda v: [int(x.replace("A", "")) for x in v])
        .map(to_zero_based)
        .map(lambda v: LabeledPoint(v[-1], v[:-1])))

# Split the data into training and test sets (70% - 30% respectively).
(trainData, testData) = data.randomSplit([0.7, 0.3])

# Train the decision tree model.
model = DecisionTree.trainClassifier(
    trainData,
    numClasses=2,
    categoricalFeaturesInfo=categorical_arity,
    impurity='entropy',
    maxDepth=3)

# Evaluate the model: fraction of test points predicted correctly.
# Pairs are (label, prediction); indexing replaces the Python-2-only
# tuple-parameter lambdas and works on both Python 2 and 3.
predictions = model.predict(testData.map(lambda lp: lp.features))
results = testData.map(lambda lp: lp.label).zip(predictions)
acc = results.filter(lambda vp: vp[0] == vp[1]).count() / float(testData.count())
print('% Aciertos: ' + str(acc * 100))

# Other metrics: true positives and true negatives.
tp = results.filter(lambda vp: vp[0] == 1 and vp[1] == 1).count()
tn = results.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count()
# Split every CSV line into its fields (the values are still strings here).
# The discarded .collect() calls that used to follow each step were removed:
# they pulled the whole RDD to the driver and threw the result away.
to_csv = rdd.map(lambda x: x.split(","))

# 80/20 train/test split with a fixed seed.
train, test = to_csv.randomSplit([0.8, 0.2], seed=42)

# Columns 0-7 are the features and column 8 is the label; convert both to
# floats explicitly before handing them to the decision tree classifier.
training_data = train.map(
    lambda x: LabeledPoint(float(x[8]), [float(v) for v in x[:8]]))

# Labels and feature vectors of the test split, kept as separate RDDs.
test_labels = test.map(lambda x: float(x[8]))
test_features = test.map(lambda x: [float(v) for v in x[:8]])

model = DecisionTree.trainClassifier(training_data,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     impurity='gini',
                                     maxDepth=6,
                                     maxBins=40)

# Predict from the eight feature columns only; previously the whole row,
# label column included, was passed to predict().
test_results = model.predict(test_features)
#print('Diabetic Predictions:')
#results = test_results.collect()
#for result in results:
#    print(result)

# We can also print out the decision tree itself:
print('Learned classification tree model:')
print(model.toDebugString())

# Pair each test label with the prediction made for the same row.  Both RDDs
# are element-wise maps of `test`, which is what zip() relies on.
labelsAndPredictions = test_labels.zip(test_results)
# Kept: this is the only action in this section that runs the prediction job.
labelsAndPredictions.collect()
# In[8]:

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
import utils_mesure

data = sc.textFile("file:/C:/spark-1.6.0-bin-hadoop2.4/"+nomF_svm+".csv")

# Drop the header: remove every line equal to the first line of the file.
nomColInit = data.first()
data2 = data.filter(lambda line: nomColInit != line)
data = data2.map(utils_mesure.parseLine)

# Sample 60% for training and 40% for testing (fixed seed).
training, test = data.randomSplit([0.6, 0.4], seed=0)

# Build the model.
model = DecisionTree.trainClassifier(training,
                                     numClasses=7,
                                     categoricalFeaturesInfo={},
                                     impurity='entropy',
                                     maxDepth=10,
                                     maxBins=32)

# Test: pair every true label with its prediction and measure the share of
# mismatches.  Indexing the pair replaces the tuple-parameter lambda, which
# is a syntax error on Python 3; this form runs on Python 2 and 3.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())


# In[8]:

# Overall model metrics.
utils_mesure.tabSum(labelsAndPredictions, 7, 'Decision Tree')
klass, (1 if age == 'adults' else 0), (1 if sex == 'women' else 0) ] return LabeledPoint(1 if survived == 'yes' else 0, features) labeled_points_rdd = data_rdd.map(row_to_labeled_point) labeled_points_rdd.takeSample(False, 5, 0) training_rdd, test_rdd = labeled_points_rdd.randomSplit([0.7, 0.3], seed = 0) training_count = training_rdd.count() test_count = test_rdd.count() training_count, test_count model = DecisionTree.trainClassifier( training_rdd, numClasses=2, categoricalFeaturesInfo={0: 3,1: 2,2: 2}) predictions_rdd = model.predict(test_rdd.map(lambda x: x.features)) truth_and_predictions_rdd = test_rdd.map(lambda lp: lp.label).zip(predictions_rdd) accuracy = truth_and_predictions_rdd.filter(lambda v_p: v_p[0] == v_p[1]).count() / float(test_count) print('Accuracy =', accuracy) print(model.toDebugString()) model = LogisticRegressionWithSGD.train(training_rdd) predictions_rdd = model.predict(test_rdd.map(lambda x: x.features)) labels_and_predictions_rdd = test_rdd.map(lambda lp: lp.label).zip(predictions_rdd)
# In[ ]:


# In[53]:

# 80% of the rows for training, 20% for testing.
trainingData, testData = fdata.randomSplit([0.8, 0.2])


# Use the decision tree classifier to train the model

# In[54]:

from pyspark.mllib.tree import DecisionTree


# In[55]:

model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=3,
    categoricalFeaturesInfo={},
)


# In[56]:

test_features = testData.map(lambda point: point.features)
predictions = model.predict(test_features)


# Create Confusion Matrix to evaluate the accuracy of the model
# We create a matrix containing the test labels as a first column (real values) and predicted values as second column

# In[57]:

# NOTE(review): despite the name, each pair is (label, prediction) -- confirm
# that whatever consumes this expects that order.
test_labels = testData.map(lambda point: point.label)
predictionsAndLabels = test_labels.zip(predictions)