-
Notifications
You must be signed in to change notification settings - Fork 0
/
SVM.py
149 lines (109 loc) · 5.59 KB
/
SVM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from sklearn import svm, linear_model
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from skll.metrics import kappa
import DataTasks as dt
import MovieTasks as mt
from scipy import stats, linalg, spatial
class SVM:
    """Trains one linear SVM per class label over a space of movie vectors.

    Workflow (driven entirely by __init__):
      1. Load movie vectors and binary per-class labels from disk (via dt).
      2. Drop classes whose positive-example count falls outside
         [amount_to_cut_at, largest_cut] (see getSampledData).
      3. Train one LinearSVC per remaining class and score it with kappa
         on the held-out split (see runSVM / runAllSVMs).
      4. Write scores, class names and hyperplane directions to disk.

    Ported from Python 2: prints are now function calls and every zip()
    that is sliced or len()'d is materialized with list().
    """

    def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True, input_size=200,
                 training_data=10000, amount_of_scores=400, low_kappa=0.1, high_kappa=0.5, rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
        # NOTE(review): class_names, class_by_class, input_size,
        # amount_of_scores, low_kappa, high_kappa and rankSVM are accepted
        # for interface compatibility but never read in this constructor.
        print("getting movie data")
        movie_vectors = dt.importVectors(vector_path)
        movie_labels = dt.importLabels(class_path)
        print("getting file names")
        # class_path[:-10] strips the trailing file segment (e.g.
        # ".../class-All") so the class directory itself is listed --
        # presumably what dt.getFns expects; verify against its definition.
        file_names = dt.getFns(class_path[:-10])
        print(len(movie_labels), len(movie_labels[0]))
        print("getting training and test data")
        # The first `training_data` movies form the training split.
        x_train = np.asarray(movie_vectors[:training_data])
        x_test = np.asarray(movie_vectors[training_data:])
        # Transpose per-movie rows -> per-class rows so whole classes can
        # be culled; list() so the result can be indexed and filtered.
        movie_labels = list(zip(*movie_labels))
        file_names, movie_labels = self.getSampledData(file_names, movie_labels, amount_to_cut_at, largest_cut)
        # Back to per-movie rows to split train/test by movie index...
        movie_labels = list(zip(*movie_labels))
        y_train = movie_labels[:training_data]
        y_test = movie_labels[training_data:]
        # ...then per-class rows again: y_train[c]/y_test[c] are the labels
        # of class c, parallel to the (filtered) file_names.
        y_train = np.asarray(list(zip(*y_train)))
        y_test = np.asarray(list(zip(*y_test)))
        print(len(y_train), len(y_test), training_data)
        print("getting kappa scores")
        kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)
        # Persist one score, one name and one direction per surviving class.
        dt.write1dArray(kappa_scores, "SVMResults/"+name_distinction+".scores")
        dt.write1dArray(file_names, "SVMResults/"+name_distinction+".names")
        dt.write2dArray(directions, "directions/"+name_distinction+".directions")

    """
    We have used the LIBSVM 27 implementation. Because default values of the parameters yielded very poor results,
    we have used a grid search procedure to find the optimal value of the C parameter for every class. To this end, the
    training data for each class was split into 2/3 training and 1/3 validation. Moreover, to address class imbalance we
    under-sampled negative training examples, such that the ratio between positive and negative training examples
    was at least 1/2.
    """

    def runRankSVM(self, y_test, y_train, x_train, x_test, class_type, input_size, file_names, keyword):
        """Fit a linear-kernel SVC for one keyword; return its coef_ direction.

        NOTE(review): y_test, x_test, class_type, input_size and file_names
        are unused here -- the method neither predicts nor scores.
        """
        clf = svm.SVC(kernel='linear', C=.1)
        clf.fit(x_train[keyword], y_train[keyword])
        #clf.decision_function(x_test)
        direction = clf.coef_
        return direction

    def runSVM(self, y_test, y_train, x_train, x_test):
        """Train one LinearSVC; return (kappa_score, direction).

        direction is the hyperplane coefficient vector as a plain list;
        the kappa score is computed on the test split.
        """
        # "balanced" replaces the "auto" class_weight that was removed in
        # scikit-learn 0.19; it reweights classes inversely to frequency.
        clf = svm.LinearSVC(class_weight="balanced")
        clf.fit(x_train, y_train)
        direction = clf.coef_.tolist()[0]
        y_pred = clf.predict(x_test).tolist()
        kappa_score = kappa(y_test, y_pred)
        return kappa_score, direction

    def runAllSVMs(self, y_test, y_train, x_train, x_test, file_names):
        """Run runSVM for every class row; return (kappa_scores, directions)."""
        kappa_scores = []
        directions = []
        # y_train/y_test are per-class label rows, parallel to file_names.
        # Local renamed from `kappa` to avoid shadowing the imported metric.
        for y in range(len(y_train)):
            kappa_score, direction = self.runSVM(y_test[y], y_train[y], x_train, x_test)
            kappa_scores.append(kappa_score)
            directions.append(direction)
            print(y, kappa_score, file_names[y])
        return kappa_scores, directions

    def rankByDirections(self, movie_names, movie_vectors, file_names, directions):
        """Return {class name: movies ranked highest along that direction}.

        NOTE(review): assumes movie_vectors[v] supports elementwise `*`
        with a direction (i.e. numpy arrays) -- TODO confirm at call site.
        argpartition returns the top-k indices but does NOT sort within
        that top-k block.
        """
        ranked = {}  # renamed from `dict` to stop shadowing the builtin
        for d in range(len(directions)):
            unsorted_ranks = []
            for v in range(len(movie_vectors)):
                unsorted_ranks.append(linalg.norm(directions[d] * movie_vectors[v]))
            unsorted_ranks = np.asarray(unsorted_ranks)
            sorted_ranks = np.argpartition(unsorted_ranks, -len(directions))[-len(directions):]
            top_ranked_movies = []
            for s in sorted_ranks:
                top_ranked_movies.append(movie_names[s])
            ranked[file_names[d]] = top_ranked_movies
        return ranked

    def getSampledData(self, file_names, movie_labels, amount_to_cut_at, largest_cut):
        """Drop classes whose positive count is outside the allowed range.

        movie_labels is a list of per-class label rows parallel to
        file_names. A class survives only if its count of 1-labels lies in
        [amount_to_cut_at, largest_cut]. Returns the filtered
        (file_names, movie_labels).

        NOTE(review): mutates the passed-in lists in place (deleted entries
        are set to None before filtering).
        """
        print(len(movie_labels))
        print(len(movie_labels[0]))
        for yt in range(len(movie_labels)):
            y1 = 0
            y0 = 0
            for y in range(len(movie_labels[yt])):
                if movie_labels[yt][y] == 1:
                    y1 += 1
                if movie_labels[yt][y] == 0:
                    y0 += 1
            if y1 < amount_to_cut_at or y1 > largest_cut:
                print(yt, "len(0):", y0, "len(1):", y1, "DELETED", file_names[yt])
                movie_labels[yt] = None
                file_names[yt] = None
                continue
            print(yt, "len(0):", y0, "len(1):", y1, file_names[yt])
        file_names = [x for x in file_names if x is not None]
        movie_labels = [x for x in movie_labels if x is not None]
        return file_names, movie_labels
def main():
    """Train per-class SVMs for each configured vector-space file."""
    path = "newdata/spaces/"
    #path="filmdata/films200.mds/"
    #array = ["700", "400", "100"]
    filenames = ["films100N0.6H75L1", "films100N0.6H50L2", "films100N0.6H25L3",
                 "films100N0.6H50L4", "films100N0.6H75L5", "films100N0.6H100L6"]
    """
    "AUTOENCODER0.2tanhtanhmse15tanh[1000]4SDA1","AUTOENCODER0.2tanhtanhmse60tanh[200]4SDA2","AUTOENCODER0.2tanhtanhmse30tanh[1000]4SDA3",
    "AUTOENCODER0.2tanhtanhmse60tanh[200]4SDA4"
    """
    cut = 100
    # One SVM run per space; the constructor does all the work (side
    # effects only, so the instance is not kept).
    for name in filenames:
        SVM(vector_path=path + name + ".mds",
            class_path="filmdata/classesPhrases/class-All",
            amount_to_cut_at=cut,
            training_data=10000,
            name_distinction=name + "LS",
            largest_cut=9999999999)


if __name__ == '__main__':
    main()