-
Notifications
You must be signed in to change notification settings - Fork 0
/
svm.py
57 lines (43 loc) · 1.62 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext
# Create the SparkContext with no explicit configuration arguments; the rest
# of this script uses it to read the input files and to save/load the model.
sc = SparkContext()
#load and parse the data
def parsePoint(line):
    """Parse one text record into a ``LabeledPoint``.

    Fields may be separated by commas, whitespace, or any mix of the two.
    The first field is used as the label and the remaining fields as the
    feature vector.

    Args:
        line: One line of text from the input file.

    Returns:
        A ``LabeledPoint`` built from the label and the features.

    Raises:
        ValueError: If the line has no fields or a field is not a valid
            float.
    """
    # The builtin float replaces the np.float alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    # split() with no argument collapses runs of whitespace, so separators
    # such as ", " or stray trailing whitespace no longer produce an empty
    # field that float() would reject.
    values = [float(x) for x in line.replace(',', ' ').split()]
    if not values:
        # Keep the exception type the original raised for a blank line.
        raise ValueError("empty record: %r" % (line,))
    return LabeledPoint(values[0], values[1:])
# Training split: read the raw text lines, then parse each into a LabeledPoint.
traindata = sc.textFile("/user/cloudera/hw2/train_hw2.csv")
parsedTrainData = traindata.map(parsePoint)

# Test split: same read-then-parse steps.
testdata = sc.textFile("/user/cloudera/hw2/test_hw2.csv")
parsedTestData = testdata.map(parsePoint)
# Train the SVM on the parsed training set using SGD with L2 regularization.
model = SVMWithSGD.train(
    parsedTrainData,
    iterations=100,
    step=1.0,
    regParam=0.01,
    regType="l2"
)
#Evaluate the model on test data
# Pair each test point's true label with the model's prediction for it.
labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))

# Tally every distinct (label, prediction) pair in a single Spark job.
# Running four separate filter(...).count() actions would re-evaluate the
# uncached RDD (parsing and prediction included) once per action.
pairCounts = labelsAndPreds.countByValue()

# Integer keys also match float labels, because 1.0 == 1 and both hash the
# same in Python; .get() supplies 0 for pairs that never occurred.
a = pairCounts.get((1, 1), 0)  # true positives
b = pairCounts.get((1, 0), 0)  # false negatives
c = pairCounts.get((0, 1), 0)  # false positives
d = pairCounts.get((0, 0), 0)  # true negatives

print("Confusion Matrix: ")
print("TP = " + str(a))
print("FN = " + str(b))
print("FP = " + str(c))
print("TN = " + str(d))
print("\n")
#Calculation
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Convert the counts to float so the divisions below are true divisions on
# Python 2 as well. The builtin float replaces the np.float alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
a = float(a)
b = float(b)
c = float(c)
d = float(d)

# Each metric falls back to 0.0 instead of raising ZeroDivisionError when its
# denominator is zero (e.g. precision when nothing was predicted positive).
accuracy = _ratio(a + d, a + b + c + d)
precision = _ratio(a, a + c)
recall = _ratio(a, a + b)
f1 = _ratio(2 * a, 2 * a + b + c)

print('Accuracy: %f' %accuracy)
print('Precision: %f' %precision)
print('Recall: %f' %recall)
print('F1: %f' %f1)
# Persist the trained model, then read it back from the same location.
modelPath = "/user/cloudera/hw2/results/2015310884_SVM"
model.save(sc, modelPath)
sameModel = SVMModel.load(sc, modelPath)