-
Notifications
You must be signed in to change notification settings - Fork 0
/
iteroesvm.py
337 lines (315 loc) · 13.5 KB
/
iteroesvm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
from __future__ import division
import re
from iterPreprocess import ctgryName,freqByCategory,wordToSynset, freqByService
from hierachy_tree import chooseSimKSynsets
import numpy as np
import math
import hashlib
from datetime import datetime
from setting import db_connection
import copy
import sys
from collections import Counter
from nltk.corpus import wordnet as wn
from subprocess import *
from cutrow import cutrow
# MongoDB database handles (connections come from setting.db_connection)
dbRepo = db_connection['PW_test']    # full API repository (source of truth)
dboesvm = db_connection['oesvm']     # working tables for the word-based OESVM
dbsoesvm = db_connection['soesvm']   # working tables for the synset-based SOESVM
dbTrain = db_connection['trainSet']  # balanced training subset
dbTest = db_connection['testSet']    # held-out test subset
loop = 0          # iteration counter for the (currently disabled) refinement loop
isStop = False    # set True by checkStability once the kfirf ranking converges
rankList = []     # previous iteration's kfirf ranking, read by checkStability
# per-run tag for ranklist dump filenames (Python 2: md5 accepts a str directly)
signature = hashlib.md5(str(datetime.now())).hexdigest()
#todo: if you want to compare between oesvm soesvm, init training set should be same. so another method should be written for copy api from soesvm initTrain table to oesvm initTrain table
#set up init Training set and copy the whole repository frequency table for testing
def consInitTrainSetAndTestSet(category, testPercent, db):
    """Build a balanced init training set and a random test set in Mongo.

    Draws halfTrainSetSize APIs of the target category and the same number of
    other-category APIs into dbTrain.apis (without replacement), then draws
    testSetSize of the remaining APIs into dbTest.apis.

    category    -- target category name (the original ignored this parameter
                   and hard-coded 'Travel')
    testPercent -- currently unused; test size is fixed at 100.
                   TODO(review): honour testPercent once desired sizes are confirmed
    db          -- source database holding the `apis` collection
    """
    ctgryApis = list(db.apis.find({'category': category}))
    nonCtgryApis = list(db.apis.find({'category': {'$ne': category}}))
    dbTrain.apis.drop()
    dbTest.apis.drop()
    halfTrainSetSize = 100
    for i in range(halfTrainSetSize):
        # randint without a size argument returns a plain int; indexing a
        # Python list with a 1-element numpy array is deprecated behaviour
        index = np.random.randint(0, len(ctgryApis))
        dbTrain.apis.insert(ctgryApis[index])
        del ctgryApis[index]
    for i in range(halfTrainSetSize):
        index = np.random.randint(0, len(nonCtgryApis))
        dbTrain.apis.insert(nonCtgryApis[index])
        del nonCtgryApis[index]
    allApis = nonCtgryApis + ctgryApis
    testSetSize = 100
    for i in range(testSetSize):
        index = np.random.randint(0, len(allApis))
        dbTest.apis.insert(allApis[index])
        del allApis[index]
#this method calculates kfirf for all categories, store as wordKfirf table for word, as synsetKfirf for synset
# all tables used are in db
def kfirf(category, alpha, isWord, db):
    """Compute the kfirf score of every term in every category.

    kfirf(term) = termFreq/maxFreq * (alpha * (1 - ctgryWithTerm/ctgryCount)
                                      + (1 - alpha) * termFreq/totalCount)

    alpha  -- weight balancing inverse-category-frequency vs. frequency share
    isWord -- True: read db.freqbyCtgry, write db.wordKfirf;
              False: read db.synsetFreqbyCtgry, write db.synsetKfirf
    NOTE: `category` is not used here; scores are computed for ALL categories.
    """
    if isWord:
        freqbyCtgryTable = db.freqbyCtgry
        db.wordKfirf.drop()
    else:
        freqbyCtgryTable = db.synsetFreqbyCtgry
        db.synsetKfirf.drop()
    ctgryCount = freqbyCtgryTable.count()
    for entry in freqbyCtgryTable.find():
        kfirfEntry = copy.deepcopy(entry)
        cnt = Counter(entry['wordlist'])
        if not cnt:
            # a category with an empty wordlist has nothing to score; the
            # original raised IndexError on most_common() here
            continue
        maxFreq = cnt.most_common(1)[0][1]
        for word in cnt:
            # count how many categories contain this term, and its total
            # frequency over all of them (one collection scan per term)
            ctgryWithWord = 0
            totalCount = 0
            for i in freqbyCtgryTable.find({'wordlist.' + word: {'$exists': True}}):
                ctgryWithWord += 1
                totalCount += i['wordlist'][word]
            kfirfEntry['wordlist'][word] = cnt[word]/maxFreq * (alpha * (1 - ctgryWithWord/ctgryCount) + (1 - alpha) * cnt[word]/totalCount)
        if isWord:
            db.wordKfirf.insert(kfirfEntry)
        else:
            db.synsetKfirf.insert(kfirfEntry)
#this method calculates kf-idfdf for all the key words in db frequency table regarding to a category(param)
def kfidfdf(beta, category, omega, isSynset):
    """Build the kfidfdf and tfidf sparse-vector tables for every API document.

    beta     -- boost applied to terms inside the top-omega kfirf band
    category -- category whose kfirf ranking drives the boost
    omega    -- number of top-ranked terms eligible for boosting
    isSynset -- True: synset tables in dbsoesvm; False: word tables in dboesvm
    Side effects: rebuilds kfidfdf, tfidf, wordIndexMap and documentFreq in
    the chosen database; reads dbTrain frequency/kfirf tables.
    """
    if isSynset:
        freqTable = dbTrain.synsetFrequency
        db = dbsoesvm
        rankList = dbTrain.synsetKfirf.find({'category': category})[0]['wordlist']
    else:
        freqTable = dbTrain.frequency
        db = dboesvm
        rankList = dbTrain.wordKfirf.find({'category': category})[0]['wordlist']
    db.kfidfdf.drop()
    db.tfidf.drop()
    # total number of API documents in the repository
    documentTotalNumber = freqTable.find().count()
    # assign a stable 1-based index to every distinct term
    wordSet = set()
    for entry in freqTable.find():
        for word in entry['wordlist']:
            wordSet.add(word)
    wordSet = list(wordSet)
    # dict lookup replaces the original repeated wordSet.index(word),
    # which was O(n) per term over the whole repository
    wordIndex = {}
    for pos, word in enumerate(wordSet):
        wordIndex[word] = pos + 1
    #wordIndexMap is used by new documents which are not in repository
    db.wordIndexMap.drop()
    db.documentFreq.drop()
    dfCounter = Counter()
    for entry in freqTable.find():
        dfCounter += Counter([word for word in entry['wordlist']])
    documentFreqDict = dict(dfCounter)
    db.documentFreq.insert(documentFreqDict)
    for word in wordSet:
        db.wordIndexMap.insert({'word': word, 'index': wordIndex[word]})
    rankList = Counter(rankList)
    # clamp omega so most_commonList[omega] stays in range. The original test
    # was inverted (`if omega < len(rankList)`): it rewrote omega exactly when
    # it was ALREADY valid, defeating the intended top-omega cut-off.
    if omega >= len(rankList):
        omega = len(rankList) - 1
    most_commonList = rankList.most_common()
    maxKfirf = most_commonList[omega][1]
    # precompute each term's rank instead of list-scanning per term
    rankOf = dict((pair[0], r) for r, pair in enumerate(most_commonList))
    #calculate kfidfdf for all words in repository
    for entry in freqTable.find():
        del entry['_id']
        totalFreq = 0
        for word in entry['wordlist']:
            #sum all keywords in one api frequency
            totalFreq += entry['wordlist'][word]
        kfidfdfEntry = {'api': '', 'vector': {}}
        tfidfEntry = {'api': '', 'vector': {}}
        for word in entry['wordlist']:
            # number of documents in the repository containing this word
            wordDocumentFreq = documentFreqDict[word]
            tfidf = entry['wordlist'][word]/totalFreq * math.log(documentTotalNumber / (wordDocumentFreq + 1), 10)
            tfidfEntry['vector'][str(wordIndex[word])] = [tfidf, word]
            if maxKfirf > rankList.get(word, -1):
                # term is below the top-omega kfirf band: plain tfidf
                kfidfdfEntry['vector'][str(wordIndex[word])] = [tfidf, word]
            else:
                rank = rankOf[word]
                kfidfdfEntry['vector'][str(wordIndex[word])] = [tfidf * (1 + (1 - math.floor(rank / math.sqrt(omega)) / math.sqrt(omega)) * beta), word]
        kfidfdfEntry['api'] = entry['api_id']
        tfidfEntry['api'] = entry['api_id']
        db.kfidfdf.insert(kfidfdfEntry)
        db.tfidf.insert(tfidfEntry)
#generate files for training and experiments.(oe)svm_train is the training set. (oe)svm_test is the test set
#(oe)svm[true false]_test is test sets generated from the whole test set. One's category is Target, while the other one is not
def generateFilesforSvm(category, svmType, isSynset, db):
    """Write libsvm-format files for the training set.

    Emits three files under ./rawdataset/master/{word,synset}/:
    <svmType>train (all APIs, label 1/0 by category), plus
    <svmType>true_train / <svmType>false_train holding only the positive /
    negative examples respectively. Each line is
    "<api> <label> idx:val idx:val ..." with indices sorted ascending.
    Vectors come from db.tfidf when svmType == 'svm', else db.kfidfdf.
    """
    #todo: add support for isSynset is False. Actually it should work well now. However each time, we should refresh the oesvm kfidf kfirf, kfidfdf every time we change between synset or not synset(word)
    if isSynset:
        path = './rawdataset/master/synset/'
    else:
        path = './rawdataset/master/word/'
    if svmType == 'svm':
        table = db.tfidf
    else:
        table = db.kfidfdf
    train = open(path + svmType + 'train', 'w')
    true_train = open(path + svmType + 'true_train', 'w')
    false_train = open(path + svmType + 'false_train', 'w')
    # the original leaked all three file handles; close them explicitly
    try:
        for entry in dbTrain.frequency.find():
            vectorEntry = table.find({'api': entry['api_id']})[0]
            # serialize the sparse vector once instead of three near-identical loops
            keys = sorted(int(k) for k in vectorEntry['vector'])
            features = ''.join(' ' + str(key) + ':' + str(vectorEntry['vector'][str(key)][0]) for key in keys)
            if category == entry['category']:
                train.write(vectorEntry['api'] + ' 1' + features + '\n')
                true_train.write(vectorEntry['api'] + ' 1' + features + '\n')
            else:
                train.write(vectorEntry['api'] + ' 0' + features + '\n')
                false_train.write(vectorEntry['api'] + ' 0' + features + '\n')
    finally:
        train.close()
        true_train.close()
        false_train.close()
#This method builds new Synset Frequency table using db.frequency table
def frequencySynset(db):
db.synsetFrequency.drop()
query = {}
f = open('XXXX', 'w')
for entry in db.frequency.find(query, timeout = False):
newWordlist = {}
for word in entry['wordlist']:
if db.wordSynsetMap.find({'word': word, 'category': entry['category']}).count():
synset = db.wordSynsetMap.find({'word': word, 'category': entry['category']})[0]['synset']
newWordlist[synset.replace('.','__')] = newWordlist.get(synset.replace('.','__'), 0) + entry['wordlist'][word]
else:
print 'XXX'
f.write(word+' '+entry['category']+'\n')
#because when conducting real test and training. Words in test set not always in train set, so we should assign a synset for it.
cnt = Counter({synet: sum(db.wordKfirf.find({'category':entry['category']})[0]['wordlist'].get(lemma.name, 0) for lemma in wn.synset(synset).lemmas) for synset in chooseSimKSynsets(word, 3, category = ctgryName.get(entry['category'], entry['category']))})
synset = cnt.most_common()[0]
newWordlist[re.sub('\.','__',synset)] = newWordlist.get(re.sub('\.','__',synset), 0) + entry['wordlist'][word]
entry['wordlist'] = newWordlist
db.synsetFrequency.insert(entry)
#This method can generate a modelFile using trainFile, test testFile with the model and output result to predictTestFile
def svmHelper(trainFile, testFile, modelFile, predictTestFile):
    """Grid-search (c, g), train an SVM model, then predict testFile.

    The best (c, g, CV rate) triple is read from the LAST line that the
    external `svm-grid` tool prints; `svm-train` and `svm-predict` are then
    invoked with those parameters.
    """
    cmd = 'svm-grid "{0}"'.format(trainFile)
    print('Cross validation...')
    p = Popen(cmd, shell = True, stdout = PIPE)
    # drain stdout BEFORE waiting (waiting first can deadlock a full pipe)
    f = p.stdout
    line = ''
    while True:
        last_line = line
        line = f.readline()
        if not line:
            break
    p.wait()
    # parse exactly once after the loop. The original parsed inside the loop
    # body, so the very first iteration tried to unpack the empty initial
    # last_line and raised ValueError before any work was done.
    c, g, rate = map(float, last_line.split())
    print('Best c={0}, g={1} CV rate={2}'.format(c, g, rate))
    cmd = 'svm-train -c {0} -g {1} "{2}" "{3}"'.format(c, g, trainFile, modelFile)
    print('Training...')
    p = Popen(cmd, shell = True, stdout = PIPE)
    # communicate() already waits for the child; the extra wait() was redundant
    p.communicate()
    cmd = 'svm-predict "{0}" "{1}" "{2}"'.format(testFile, modelFile, predictTestFile)
    print('Testing...')
    p = Popen(cmd, shell = True)
    p.communicate()
    print('Output prediction: {0}'.format(predictTestFile))
#This method check whether the ranklist is stable. If the ranklist is stable, the iteration can stop.
def checkStability(db, category, isSynset):
global rankList, isStop
f_ranklist = open('./ranklist/ranklist-'+ signature + '-' +str(loop), 'w')
if isSynset:
table = db.synsetKfirf
else:
table = db.wordKfirf
newRankList = []
for entry in table.find({'category':category}):
cnt = Counter(entry['wordlist'])
newRankList = [pair[0] for pair in cnt.most_common()]
length = min(150, len(newRankList), len(rankList))
if newRankList == rankList:
isStop = True
print 'stop!'
else:
rankList = newRankList
f_ranklist.write(' '.join(rankList)+'\n')
#use this function every time you classify a new category or you change any formula
def initialize():
    """Rebuild every training-side table for the global `category`.

    Pipeline (order matters -- each step feeds the next):
    train/test split -> word frequencies -> word kfirf -> word-to-synset map
    -> synset frequencies -> synset kfirf -> kfidfdf vectors (both variants).
    """
    global category
    # fresh balanced training set and random test set
    # (second argument testPercent appears unused by the callee -- TODO confirm)
    consInitTrainSetAndTestSet(category, 0.2, dbRepo)
    # per-API word frequencies, then per-category aggregates
    freqByService(dbTrain)
    freqByCategory(dbTrain.frequency, dbTrain.freqbyCtgry)
    # kfirf over words (alpha = 0.4)
    kfirf(category, 0.4, True, dbTrain)
    # map words to synsets, then redo frequency + kfirf on the synset side
    wordToSynset(dbTrain)
    frequencySynset(dbTrain)
    freqByCategory(dbTrain.synsetFrequency, dbTrain.synsetFreqbyCtgry)
    kfirf(category, 0.4, False, dbTrain)
    #kfidfdf is only for category's api
    kfidfdf(0.5, category, 100, True)
    kfidfdf(0.5, category, 100, False)
# ---- driver script: one-shot (non-iterative) experiment ----
category = 'Travel'
initialize()
# emit libsvm input files for each classifier variant (word vs synset vectors)
generateFilesforSvm('Travel', 'oesvm', False, dboesvm)
generateFilesforSvm('Travel', 'svm', False, dboesvm)
generateFilesforSvm('Travel', 'soesvm', True, dbsoesvm)
generateFilesforSvm('Travel', 'svm', True, dbsoesvm)
cutrow()
# NOTE(review): trainFile and testFile are the SAME path in all four runs, so
# these report training-set accuracy only -- confirm this is intended
svmHelper('./dataset/master/word/oesvmtrain', './dataset/master/word/oesvmtrain', './model/master/modelforoesvm', 'predict_test_result')
svmHelper('./dataset/master/word/svmtrain', './dataset/master/word/svmtrain', './model/master/modelforsvm', 'predict_test_result')
svmHelper('./dataset/master/synset/svmtrain', './dataset/master/synset/svmtrain', './model/master/modelforsynsetsvm', 'predict_test_result')
svmHelper('./dataset/master/synset/soesvmtrain', './dataset/master/synset/soesvmtrain', './model/master/modelforsoesvm', 'predict_test_result')
"""
#Loop
isSynset = True
category = 'Travel'
isInit = True
if isInit:
initialize()
if isSynset:
db = dbsoesvm
svmType = 'synset'
else:
db = dboesvm
svmType = 'word'
checkStability(db, category, isSynset)
while not isStop:
if loop > 0:
kfidfdf(0.5, category, 100, True, db)
generateFilesforSvm(category, 'oesvm', isSynset, db)
cutrow()
svmHelper('./dataset/iteration/'+ svmType +'/oesvmtrain', './dataset/iteration/' + svmType + '/oesvmtest', './model/iteration/modelforsoesvm', 'predict_test_result')
f_test = open('./rawdataset/iteration/'+ svmType + '/oesvmtest')
f_result = open('./predict_test_result')
#a dict for map lineNum to service <'lineNum':'api_id'>
lineNumToService = {}
lineNum = 0
for line in f_test:
ID = line.split(' ')[0]
lineNumToService[lineNum] = ID
lineNum += 1
lineNum = 0
for line in f_result:
if db.frequency.find({'api_id':lineNumToService[lineNum]}, {'category':1})[0]['category'] == category:
originCategory = 1
else:
originCategory = 0
if originCategory == 1 and line.rstrip('\n') == '0':
#remove original travel now non-travel service, because we cannot assign a category for it
db.frequency.remove({'api_id':lineNumToService[lineNum]})
db.synsetFrequency.remove({'api_id':lineNumToService[lineNum]})
print 'remove', lineNumToService[lineNum]
if originCategory == 0 and line.rstrip('\n') == '1':
db.frequency.update({'api_id':lineNumToService[lineNum]}, {'$set':{'category': category}})
db.synsetFrequency.update({'api_id':lineNumToService[lineNum]}, {'$set':{'category': category}})
print 'update', lineNumToService[lineNum]
lineNum += 1
#re-build all the table
freqByCategory(db.frequency, db.freqbyCtgry)
kfirf(category, 0.4, isSynset, True, db)
wordToSynset(db)
frequencySynset(db)
freqByCategory(db.synsetFrequency, db.synsetFreqbyCtgry)
kfirf(category, 0.4, isSynset, False, db)
loop += 1
print loop
checkStability(db, category, isSynset)
"""