def task_C(df_train, df_test, sc, sqlContext, pipe, features_col,
           first_topic_index=125):
    """Cross-validate NaiveBayes and DecisionTree topic by topic.

    For each unique value of ``df_train.topic`` whose 0-based position is at
    least ``first_topic_index``, the rows of that topic are concatenated with
    the ``df_test`` rows selected by ``getSimilar``, the ``class`` column is
    shifted by +2, the result is converted to a Spark DataFrame, run through
    ``pipe`` and evaluated twice with ``cross_validation_task_C`` (once per
    model).  Progress is printed every 25 topics.

    Args:
        df_train: pandas DataFrame exposing a ``topic`` column.
        df_test: pandas DataFrame; rows are picked positionally with ``.iloc``.
        sc: forwarded unchanged to ``cross_validation_task_C``
            (presumably the SparkContext -- confirm against that helper).
        sqlContext: used for ``createDataFrame`` and forwarded to the helper.
        pipe: object providing ``fit(df).transform(df)``.
        features_col: name of the features column used by NaiveBayes and
            forwarded to the helper.
        first_topic_index: topics positioned before this index are skipped.
            The default (125) reproduces the previously hard-coded value,
            which looks like a "resume from topic 125" shortcut.

    Returns:
        ``(mean(maem), mean(maeni), mean(maem_dt), mean(maeni_dt))`` -- the
        NaiveBayes pair followed by the DecisionTree pair.  When no topic is
        evaluated the lists are empty and ``np.mean`` yields ``nan``.
        NOTE(review): maem/maeni are presumably two flavours of mean absolute
        error returned by ``cross_validation_task_C`` -- confirm there.
    """
    nb = (NaiveBayes(smoothing=1.0, modelType="multinomial")
          .setFeaturesCol(features_col)
          .setLabelCol('class'))
    dtc = DecisionTree()

    maem, maeni = [], []
    maem_dt, maeni_dt = [], []

    # Compute the topic list once; it drives both the banner and the loop.
    topics = np.unique(df_train.topic)
    print("Numero di topic da processare:", len(topics))

    for index, topic in enumerate(topics):
        if index >= first_topic_index:
            # First element returned by getSimilar is used as positional
            # row indices into df_test.
            index_test = getSimilar(df_train, df_test, topic, test_size=0.3)[0]
            tt_set = pd.concat(
                [df_train[df_train.topic == topic], df_test.iloc[index_test]])
            # Shift labels by +2 (presumably to make them non-negative for
            # the multinomial model -- TODO confirm).
            tt_set['class'] = tt_set['class'].map(lambda x: x + 2)

            df_f = sqlContext.createDataFrame(tt_set)
            tr3_test = pipe.fit(df_f).transform(df_f)

            # The 4th argument tells the cross-validator which model type it
            # received: True = tree, False = NaiveBayes.
            # A fresh pandas copy is handed to each call, as before.
            maem_aux, maeni_aux = cross_validation_task_C(
                tr3_test.toPandas(), nb, sqlContext, False, features_col, sc)
            maem.append(maem_aux)
            maeni.append(maeni_aux)

            maem_aux, maeni_aux = cross_validation_task_C(
                tr3_test.toPandas(), dtc, sqlContext, True, features_col, sc)
            maem_dt.append(maem_aux)
            maeni_dt.append(maeni_aux)

            # Output recorded by the original author from an earlier run:
            #   Topic processati: 150
            #   2.17504068541 1.18580006402 1.31400883186 0.601955388286

        processed = index + 1
        if processed % 25 == 0:
            print("Topic processati:", processed)
            print(np.mean(maem), np.mean(maeni), np.mean(maem_dt),
                  np.mean(maeni_dt))
            print(
                "NaiveBayes maem, NaiveBayes maeni, DecisionTree maem, DecisionTree maeni"
            )

    return (np.mean(maem), np.mean(maeni), np.mean(maem_dt),
            np.mean(maeni_dt))
# SVM trained with stochastic gradient descent.
algo = SVMWithSGD()
model = algo.train(training_data)
score(model)

##### Trees #####
##### Three tree-based classifiers follow. Their training entry point is
##### `trainClassifier`, which differs from the `train` call used above.
from pyspark.mllib.tree import DecisionTree, GradientBoostedTrees, RandomForest

# Single decision tree, binary labels, no categorical features declared.
algo = DecisionTree()
model = algo.trainClassifier(
    training_data,
    categoricalFeaturesInfo={},
    numClasses=2,
)
score(model)

# Gradient-boosted trees, 10 boosting iterations.
algo = GradientBoostedTrees()
model = algo.trainClassifier(
    training_data,
    numIterations=10,
    categoricalFeaturesInfo={},
)
score(model)

# Random forest of 16 trees, binary labels.
algo = RandomForest()
model = algo.trainClassifier(
    training_data,
    numTrees=16,
    categoricalFeaturesInfo={},
    numClasses=2,
)
score(model)

#### Naive Bayes ####
#### Finally, the Naive Bayes classifier.
from pyspark.mllib.classification import NaiveBayes
# Hyper-parameters shared by the models trained below.
num_iterations = 10
max_tree_depth = 5

# --- train one model of each kind and print it ---
lr_model = LogisticRegressionWithLBFGS.train(data, iterations=num_iterations)
print("logistic regression model :")
print(lr_model)

svm_model = SVMWithSGD.train(data, iterations=num_iterations)
print("svm model :")
print(svm_model)

# Naive Bayes is trained on its own dataset (nb_data), not on `data`.
nb_model = NaiveBayes.train(nb_data)
print("naive bayes model :")
print(nb_model)

# max_tree_depth was previously defined but never handed to the trainer;
# pass it explicitly so changing the constant actually affects the tree.
dt_model = DecisionTree.trainClassifier(
    data,
    numClasses=2,
    categoricalFeaturesInfo={},
    maxDepth=max_tree_depth,
)
print("decision tree model :")
print(dt_model)

# --- predict on the first record and compare with its label ---
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))

# --- column statistics of the feature vectors ---
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
print("the col mean of matrix :")
print(matrix_summary.mean())
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Local Spark context with two worker threads.
conf = SparkConf().setAppName('Decision Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load the LibSVM-formatted dataset.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')

# Split the data into training (70%) and test (30%) sets.
training, testData = data.randomSplit([0.7, 0.3])

# Train a decision-tree regressor.
model = DecisionTree.trainRegressor(
    training,
    categoricalFeaturesInfo={},
    impurity='variance',
    maxDepth=5,
    maxBins=32,
)

# Evaluate the model on the test set and compute the mean squared error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda x: x.label).zip(predictions)
# Index into the (label, prediction) pair instead of using tuple-parameter
# unpacking in the lambda, which is a SyntaxError on Python 3.
testMSE = labelAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
print('test mean squared error :' + str(testMSE))
print('learned regression tree model :')
print(model.toDebugString())

# Save the trained model to disk (only the save step is visible here).
model.save(sc, '../model/myDecisionTreeRegressionModel')