-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluationSVM.py
122 lines (95 loc) · 4.94 KB
/
evaluationSVM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import loadFiles as lf
import preProcess as pp
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from random import randint
from pyspark.mllib.classification import NaiveBayes
from pyspark import SparkContext
from pyspark import SparkFiles
from functools import partial
from pyspark.mllib.classification import LogisticRegressionWithSGD,SVMWithSGD,SVMModel
from pyspark.mllib.tree import RandomForest
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import SVMWithSGD, SVMModel
# --- Spark setup -------------------------------------------------------------
# TODO: change to the cluster directory
trainF = "/home/xavier.callens/DataCamp/train"  # path to the labeled training data
sc = SparkContext(appName="Simple App")  # initialize the Spark context

# We are not running from the command-line interface, so our own modules have
# to be registered with the Spark context to make them available to the workers.
sc.addFile("helpers.py")
# The shipped file name must match the module imported below (extract_terms);
# the previous spelling "exctract_terms.py" did not.
sc.addFile("extract_terms.py")

# Imported only after addFile so the same modules are available to the workers.
# NOTE(review): the star import presumably supplies `compute` and `toLB`, which
# this script calls further down -- confirm against helpers.py.
from helpers import *
import extract_terms as et
# Load the data: `data` is a list holding the text of one document per cell and
# `Y` holds the respective class values (1: positive, 0: negative).
print("loading local data")
data, Y = lf.loadLabeled(trainF)

# Clean the documents. Per the original author's note this strips numbers, HTML
# tags and punctuation, except for "?", "!" and "." -- "?" and "!" are replaced
# by ".". The return value is not used, so `data` is presumably cleaned in place.
print("preprocessing")
pp.proc(data)

# TF-IDF vectorizer whose terms are extracted by our own custom analyzer
# function. It is only constructed here; nothing is fitted yet.
m = TfidfVectorizer(analyzer=et.terms)
# We need an array to distribute to the workers. It has one element per slice,
# so every partition receives exactly one element. Exactly one randomly chosen
# element is set to non-zero; after the filter below only that element is left,
# which means `compute` is invoked a single time.
NUM_SLICES = 8
worker_flags = np.zeros(NUM_SLICES)
chosen_slot = randint(0, NUM_SLICES - 1)  # randint is inclusive on both ends
worker_flags[chosen_slot] = 1

# Broadcast the vectorizer and the documents so they are available to all workers.
md = sc.broadcast(m)
datad = sc.broadcast(data)

# `partial` pre-binds the broadcast handles as keyword arguments, leaving the
# RDD element as the only positional argument passed by map().
# numSlices determines how many partitions the data should have; it also helps
# when we want to reduce the size of each task for each worker.
fit_on_worker = partial(compute, model=md, data=datad)
tmpRDD = (
    sc.parallelize(worker_flags, numSlices=NUM_SLICES)
    .filter(lambda flag: flag != 0)
    .map(fit_on_worker)
)

print("transforming the data in a remote machine")
# collect() triggers the job and brings back a double-nested result:
# [[coordinate_matrix, fitted_vectorizer]]
data = tmpRDD.collect()
remote_result = data[0]
tfidf_coo = remote_result[0]
m = remote_result[1]  # the fitted vectorizer; re-assigned just in case

# The broadcast documents are not needed anymore.
datad.unpersist()
# Distribute the coordinate data over 512 partitions.
# Per the original notes the layout is
# [[value, row_index, column_index], [value, row_index, column_index], ...]
coo_rdd = sc.parallelize(tfidf_coo, 512)


def comb(x, y):
    """Combine two tuples/arrays into one "vertical" (row-stacked) array."""
    return np.vstack([np.array(x), np.array(y)])


# Re-key every entry as (row_index, (value, column_index)) and group by the
# row index. The [0, 0] seed is stacked into each group as an extra
# (value=0, column=0) row.
# NOTE(review): toLB presumably tolerates those seed rows -- confirm in helpers.py.
keyed_entries = coo_rdd.map(lambda entry: (entry[1], (entry[0], entry[2])))
rows_rdd = keyed_entries.aggregateByKey([0, 0], comb, comb)

# Broadcast the class variable (needed to create the labeled points).
bY = sc.broadcast(Y)

# The number of features is the number of columns of the matrix; we need it to
# convert the coordinate data into vectors / labeled points.
cols = sc.broadcast(len(m.get_feature_names()))
print("number of features"+str(cols.value))

# Convert to labeled points in parallel.
to_labeled_point = partial(toLB, cols=cols, class_v=bY)
tmpLB = rows_rdd.map(to_labeled_point)
# --- Split, train and evaluate -----------------------------------------------
print("splitting the data")
# 60% training / 40% test; the fixed seed keeps the split reproducible.
train, test = tmpLB.randomSplit([0.6, 0.4], seed=0)

print("training the machine learning algorithm")
# The second positional parameter of SVMWithSGD.train is `iterations`, not a
# smoothing value as in the earlier NaiveBayes.train(train, 1.0) call. Passing
# 1.0 there limited training to a single SGD iteration, so the iteration count
# is now given explicitly by keyword.
model = SVMWithSGD.train(train, iterations=100)

# Alternative learners that were tried (kept for reference):
#   1) Naive Bayes:          model = NaiveBayes.train(train, 1.0)
#   2) Logistic regression:  model = LogisticRegressionWithSGD.train(train)
#   3) Random forest:        model = RandomForest.trainClassifier(train, 2, {}, 300, seed=2)
#      (the original author notes that the random forest model in pyspark is
#      experimental, and that 300 trees is the best setting for this dataset
#      according to the literature)

print("retrieving predictions and evaluating")
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Index into the (prediction, label) pair instead of using tuple-parameter
# unpacking in the lambda, which is Python-2-only syntax.
correct = predictionAndLabel.filter(lambda pair: pair[0] == pair[1]).count()
# Multiply by 1.0 to force float division on Python 2 as well.
accuracy = 1.0 * correct / test.count()
print("accuracy for SVM:"+str(accuracy))