-
Notifications
You must be signed in to change notification settings - Fork 0
/
trainerD115.py
38 lines (29 loc) · 1.67 KB
/
trainerD115.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import sys
import pickle
from numpy import array
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# Train a random-forest classifier on the training CSV, score it on the
# validation CSV, print the evaluation metrics and save the model to S3.

# Input datasets: ';'-separated text, one record per line, label in the last column.
trainDataPath = "s3://cloudpa2/TrainingDataset.csv"
valDataPath = "s3://cloudpa2/ValidationDataset.csv"


def _parse_records(lines):
    """Turn an RDD of raw text lines into an RDD of lists of floats.

    Lines containing a double quote are dropped (presumably the quoted
    header row -- confirm against the dataset), blank lines are skipped so a
    stray empty line cannot abort the job in ``float('')``, and every
    remaining line is split on ';' with each field converted to ``float``.
    """
    return (
        lines
        .filter(lambda line: line.strip() and '"' not in line)
        .map(lambda line: [float(field) for field in line.split(";")])
    )


# The application name must be on the SparkConf handed to the context:
# sc.getConf() returns a copy, so setting the name on it afterwards is a no-op.
sparkConf = SparkConf().setAppName('cloudpa2')
sc = SparkContext.getOrCreate(sparkConf)

# Training set: the last column is the label, shifted down by one to be
# zero-based; the remaining columns form the dense feature vector.
rawData = sc.textFile(trainDataPath)
records = _parse_records(rawData)
data = records.map(
    lambda row: LabeledPoint(int(row[-1]) - 1, Vectors.dense(row[:-1]))
)

# Validation set as (features, zero-based label) pairs. It is traversed twice
# below (once for the features, once for the labels), so keep it cached
# instead of re-reading and re-parsing the file for each pass.
valRawData = sc.textFile(valDataPath)
valRecords = _parse_records(valRawData)
valData = valRecords.map(lambda row: (row[:-1], int(row[-1]) - 1)).cache()

rfModel = RandomForest.trainClassifier(
    data, numClasses=10, categoricalFeaturesInfo={}, numTrees=100
)

# predict() is called on the whole feature RDD and its output is paired with
# the labels; both sides derive from valData, so they line up element-wise.
valFeatures = valData.map(lambda pair: pair[0])
valLabels = valData.map(lambda pair: float(pair[1]))
rfModelPredictionAndLabels = rfModel.predict(valFeatures).zip(valLabels)

rfModelMetric = MulticlassMetrics(rfModelPredictionAndLabels)
# Report an actual F-measure under the "F1Score" label (the weighted average
# over classes); accuracy is reported separately under its own name.
print("F1Score : %s" % rfModelMetric.weightedFMeasure())
print("Accuracy : %s" % rfModelMetric.accuracy)

rfModel.save(sc, "s3://myprogrambucket123/rfwine_model.model")