def runSVM(C, seednum=18283835):
    """Train an SVM on browser "Family" labels and return hold-out accuracy.

    Rows with Type "Browser" and a non-NULL Family are read from the SQLite
    database ``../mydb``, shuffled, and split 80/20 into training and test
    samples.  Each sample is ``(family, {token_id + 1: count})``.  The trained
    model is saved at ``decProb.modelPath()``; a ``.labels`` and a ``.results``
    file are written next to it.

    NOTE(review): this module defines ``runSVM`` a second time further down;
    the later definition is the one bound to the name after import.

    Args:
        C: value assigned to the ``C`` attribute of the ``LinearSVMParams``
            handed to ``DecisionProblem``.
        seednum: passed to ``random.seed`` so the shuffle (and therefore the
            train/test split) is reproducible.  Only the *string* ``'-1'``
            selects an unseeded run; the integer ``-1`` is used as a seed.

    Returns:
        The fraction of test samples whose predicted label equals the actual
        label, as a float.
    """
    if seednum == '-1':
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # share of the shuffled rows used for training

    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()

        # Second field of every line is a token id; ids are shifted by +1
        # everywhere in this function.
        with open('../../data/tokens.txt', 'r') as ftokens:
            keys = [int(line.strip().partition(' ')[2]) + 1
                    for line in ftokens]
        maxtok = max(keys)
        # Each print takes one pre-formatted argument so the emitted text is
        # the same under the Python 2 print statement and the Python 3
        # function.  The double space reproduces the old two-item output.
        print('NUMBER OF TOKENS:  %s' % (maxtok,))
        print('NUM TOKENS USED  %s' % (maxtok,))

        # For a quick debug run, narrow the query to a few families, e.g.
        # append: AND ("Family" = "Firefox" OR "Family" = "IE")
        c.execute('select Tokens, Family from data '
                  'where "Type" = "Browser" AND "Family" IS NOT NULL')
        uaslist = [[row[0], row[1]] for row in c]
    finally:
        # Everything needed has been copied out of the cursor.
        conn.close()

    def count_tokens(token_field):
        """Map each shifted token id below maxtok to its occurrence count."""
        counts = {}
        for raw in token_field.split(" "):
            tok = int(raw) + 1
            # NOTE(review): the strict '<' drops the largest id listed in
            # tokens.txt -- confirm whether '<=' was intended.
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    ntrain = int(round(frac * nbrowser, 0))

    trainingData = []
    testData = []
    familydict = {}  # families seen in either split
    for position, (token_field, family) in enumerate(uaslist):
        familydict[family] = 1
        sample = (family, count_tokens(token_field))
        if position < ntrain:
            trainingData.append(sample)
        else:
            testData.append(sample)

    print('Number browsers selected for training:  %s' % (ntrain,))
    print(str(familydict))
    print(len(familydict))

    # Make a Decision Problem and train.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Family", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # Persist the model and a "label:index" listing next to it.
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)
    with open(filename + '.labels', 'w') as flabel:
        # NOTE(review): indices follow set() iteration order; confirm they do
        # not need to agree with decProb.labelMap.
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    # Predict.
    # FIXME need to build in confusion matrix for these multiclass problems
    ytest, xtest = decProb.maketest(testData)
    pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(ytest, xtest,
                                                            model)
    correct = 0.0
    total = 0.0
    for i, actual in enumerate(ytest):
        prediction = pred_labels[i]
        if actual == prediction:
            correct += 1.0
        else:
            print('actual:  %s predicted:  %s' % (actual, prediction))
        total += 1.0

    accuracy = correct / total
    print('ACCURACY:  %s' % (accuracy,))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write("ACCURACY: " + str(accuracy) + '\n')
    return accuracy
def runSVMlinear(C=3.0, seednum=18283835):
    """Train a linear-kernel SVM separating "Robot" from "Browser" rows.

    For each of the two Type values, rows are read from the SQLite database
    ``../mydb``, shuffled, and split 80/20 into training and test samples of
    the form ``(type, {token_id + 1: count})``.  The trained model is saved
    at ``decProb.modelPath()`` together with ``.labels`` and ``.results``
    files; user-agent strings of misclassified test rows are written to
    ``misclass.txt`` in the current directory.

    NOTE(review): this module defines ``runSVMlinear`` again further down
    with an extra ``gamma`` parameter; that later definition is the one bound
    to the name after import.

    Args:
        C: value assigned to the ``C`` attribute of the ``LinearSVMParams``
            handed to ``DecisionProblem``.
        seednum: passed to ``random.seed`` so both shuffles are reproducible.
            Only the *string* ``'-1'`` selects an unseeded run; the integer
            ``-1`` is used as a seed.

    Returns:
        ``(accuracy, correct0, false0, correct1, false1)`` as floats, where
        "false X" counts samples predicted as X whose actual label differs.
    """
    if seednum == '-1':
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # share of each class used for training

    # (label, query, progress message) per class; robots are processed first
    # so the random stream is consumed in the same order as before.
    classes = (
        ('Robot',
         'select Tokens, uaString from data where "Type" = "Robot"',
         'Number robots selected for training:  %s'),
        ('Browser',
         'select Tokens, uaString from data where "Type" = "Browser"',
         'Number browsers selected for training:  %s'),
    )

    trainingData = []
    testData = []
    testlist = []  # uaString of every test sample, in testData order

    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()

        # Second field of every line is a token id; ids are shifted by +1
        # everywhere in this function.
        with open('../../data/tokens.txt', 'r') as ftokens:
            keys = [int(line.strip().partition(' ')[2]) + 1
                    for line in ftokens]
        maxtok = max(keys)
        # Each print takes one pre-formatted argument so the emitted text is
        # the same under the Python 2 print statement and the Python 3
        # function.  The double space reproduces the old two-item output.
        print('NUMBER OF TOKENS:  %s' % (maxtok,))
        print('NUM TOKENS USED  %s' % (maxtok,))

        def count_tokens(token_field):
            """Map each shifted token id below maxtok to its count."""
            counts = {}
            for raw in token_field.split(" "):
                tok = int(raw) + 1
                # NOTE(review): the strict '<' drops the largest id listed in
                # tokens.txt -- confirm whether '<=' was intended.
                if tok < maxtok:
                    counts[tok] = counts.get(tok, 0.0) + 1.0
            return counts

        for label, query, message in classes:
            c.execute(query)
            rows = [(row[0], row[1]) for row in c]
            random.shuffle(rows)
            ntrain = int(round(frac * len(rows), 0))
            for position, (token_field, ua) in enumerate(rows):
                sample = (label, count_tokens(token_field))
                if position < ntrain:
                    trainingData.append(sample)
                else:
                    testData.append(sample)
                    testlist.append(ua)
            print(message % (ntrain,))
    finally:
        conn.close()

    # Make a Decision Problem and train.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Type", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # Persist the model and a "label:index" listing next to it.
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)
    with open(filename + '.labels', 'w') as flabel:
        # NOTE(review): indices follow set() iteration order; confirm they do
        # not need to agree with decProb.labelMap.
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    ytest, xtest = decProb.makedata(testData)
    pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(ytest, xtest,
                                                            model)

    # Tally per-class results.
    # NOTE(review): assumes makedata encodes 'Robot' as 0 and 'Browser' as 1
    # and keeps testData order, so testlist[i] matches ytest[i] -- confirm.
    correct0 = 0.0
    correct1 = 0.0
    false0 = 0.0
    false1 = 0.0
    total = 0.0
    with open('misclass.txt', 'w') as fmiss:
        for i, actual in enumerate(ytest):
            prediction = pred_labels[i]
            if actual == 0:
                if actual == prediction:
                    correct0 += 1.0
                else:
                    false1 += 1.0
                    fmiss.write('false Browser: ' + testlist[i] + '\n')
            elif actual == 1:
                if actual == prediction:
                    correct1 += 1.0
                else:
                    false0 += 1.0
                    fmiss.write('false Robot: ' + testlist[i] + '\n')
            total += 1.0

    accuracy = (correct0 + correct1) / total
    legend = ("False X means classifier said data was X, "
              "but it was actually something else")
    print('ACCURACY:  %s' % (accuracy,))
    print(legend)
    print('Correct Robot:  %s False Robot:  %s' % (correct0, false0))
    print('Correct Browser:  %s False Browser:  %s' % (correct1, false1))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write(
            "ACCURACY: " + str(accuracy) + '\n'
            + legend + '\n'
            + "Correct 0: " + str(correct0)
            + " False 0: " + str(false0) + '\n'
            + "Correct 1: " + str(correct1)
            + " False 1: " + str(false1) + '\n')
    return accuracy, correct0, false0, correct1, false1
def runSVMlinear(C=3.0, gamma=0.1, seednum=18283835):
    """Train an SVM separating "Robot" from "Browser" rows (kernel "RBF").

    Despite the function name, ``kernelName`` is set to ``"RBF"`` and
    ``gamma`` is forwarded to the parameter object.

    For each of the two Type values, rows are read from the SQLite database
    ``../mydb``, shuffled, and split 80/20 into training and test samples of
    the form ``(type, {token_id + 1: count})``.  The trained model is saved
    at ``decProb.modelPath()`` together with ``.labels`` and ``.results``
    files; user-agent strings of misclassified test rows are written to
    ``misclass.txt`` in the current directory.

    NOTE(review): an earlier ``runSVMlinear(C, seednum)`` in this module is
    replaced by this definition at import time.  A caller passing
    ``(C, seednum)`` positionally therefore binds its seed to ``gamma``.

    Args:
        C: value assigned to ``params.C``.
        gamma: value assigned to ``params.gamma``.
        seednum: passed to ``random.seed`` so both shuffles are reproducible.
            Only the *string* ``'-1'`` selects an unseeded run; the integer
            ``-1`` is used as a seed.

    Returns:
        ``(accuracy, correct0, false0, correct1, false1)`` as floats, where
        "false X" counts samples predicted as X whose actual label differs.
    """
    if seednum == '-1':
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # share of each class used for training

    # (label, query, progress message) per class; robots are processed first
    # so the random stream is consumed in the same order as before.
    classes = (
        ('Robot',
         'select Tokens, uaString from data where "Type" = "Robot"',
         'Number robots selected for training:  %s'),
        ('Browser',
         'select Tokens, uaString from data where "Type" = "Browser"',
         'Number browsers selected for training:  %s'),
    )

    trainingData = []
    testData = []
    testlist = []  # uaString of every test sample, in testData order

    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()

        # Second field of every line is a token id; ids are shifted by +1
        # everywhere in this function.
        with open('../../data/tokens.txt', 'r') as ftokens:
            keys = [int(line.strip().partition(' ')[2]) + 1
                    for line in ftokens]
        maxtok = max(keys)
        # Each print takes one pre-formatted argument so the emitted text is
        # the same under the Python 2 print statement and the Python 3
        # function.  The double space reproduces the old two-item output.
        print('NUMBER OF TOKENS:  %s' % (maxtok,))
        print('NUM TOKENS USED  %s' % (maxtok,))

        def count_tokens(token_field):
            """Map each shifted token id below maxtok to its count."""
            counts = {}
            for raw in token_field.split(" "):
                tok = int(raw) + 1
                # NOTE(review): the strict '<' drops the largest id listed in
                # tokens.txt -- confirm whether '<=' was intended.
                if tok < maxtok:
                    counts[tok] = counts.get(tok, 0.0) + 1.0
            return counts

        for label, query, message in classes:
            c.execute(query)
            rows = [(row[0], row[1]) for row in c]
            random.shuffle(rows)
            ntrain = int(round(frac * len(rows), 0))
            for position, (token_field, ua) in enumerate(rows):
                sample = (label, count_tokens(token_field))
                if position < ntrain:
                    trainingData.append(sample)
                else:
                    testData.append(sample)
                    testlist.append(ua)
            print(message % (ntrain,))
    finally:
        conn.close()

    # Make a Decision Problem and train.
    params = LinearSVMParams()
    params.kernelName = "RBF"
    params.gamma = gamma
    params.C = C
    decProb = DecisionProblem("Type", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # Persist the model and a "label:index" listing next to it.
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)
    with open(filename + '.labels', 'w') as flabel:
        # NOTE(review): indices follow set() iteration order; confirm they do
        # not need to agree with decProb.labelMap.
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    ytest, xtest = decProb.makedata(testData)
    pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(ytest, xtest,
                                                            model)

    # Tally per-class results.
    # NOTE(review): assumes makedata encodes 'Robot' as 0 and 'Browser' as 1
    # and keeps testData order, so testlist[i] matches ytest[i] -- confirm.
    correct0 = 0.0
    correct1 = 0.0
    false0 = 0.0
    false1 = 0.0
    total = 0.0
    with open('misclass.txt', 'w') as fmiss:
        for i, actual in enumerate(ytest):
            prediction = pred_labels[i]
            if actual == 0:
                if actual == prediction:
                    correct0 += 1.0
                else:
                    false1 += 1.0
                    fmiss.write('false Browser: ' + testlist[i] + '\n')
            elif actual == 1:
                if actual == prediction:
                    correct1 += 1.0
                else:
                    false0 += 1.0
                    fmiss.write('false Robot: ' + testlist[i] + '\n')
            total += 1.0

    accuracy = (correct0 + correct1) / total
    legend = ("False X means classifier said data was X, "
              "but it was actually something else")
    print('ACCURACY:  %s' % (accuracy,))
    print(legend)
    print('Correct Robot:  %s False Robot:  %s' % (correct0, false0))
    print('Correct Browser:  %s False Browser:  %s' % (correct1, false1))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write(
            "ACCURACY: " + str(accuracy) + '\n'
            + legend + '\n'
            + "Correct 0: " + str(correct0)
            + " False 0: " + str(false0) + '\n'
            + "Correct 1: " + str(correct1)
            + " False 1: " + str(false1) + '\n')
    return accuracy, correct0, false0, correct1, false1
def runSVM(C, seednum=18283835):
    """Train an SVM on browser "Family" labels and return hold-out accuracy.

    Rows with Type "Browser" and a non-NULL Family are read from the SQLite
    database ``../mydb``, shuffled, and split 80/20 into training and test
    samples.  Each sample is ``(family, {token_id + 1: count})``.  The trained
    model is saved at ``decProb.modelPath()``; a ``.labels`` and a ``.results``
    file are written next to it.

    NOTE(review): an earlier definition named ``runSVM`` exists in this
    module; this later one replaces it at import time.

    Args:
        C: value assigned to the ``C`` attribute of the ``LinearSVMParams``
            handed to ``DecisionProblem``.
        seednum: passed to ``random.seed`` so the shuffle (and therefore the
            train/test split) is reproducible.  Only the *string* ``'-1'``
            selects an unseeded run; the integer ``-1`` is used as a seed.

    Returns:
        The fraction of test samples whose predicted label equals the actual
        label, as a float.
    """
    if seednum == '-1':
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # share of the shuffled rows used for training

    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()

        # Second field of every line is a token id; ids are shifted by +1
        # everywhere in this function.
        with open('../../data/tokens.txt', 'r') as ftokens:
            keys = [int(line.strip().partition(' ')[2]) + 1
                    for line in ftokens]
        maxtok = max(keys)
        # Each print takes one pre-formatted argument so the emitted text is
        # the same under the Python 2 print statement and the Python 3
        # function.  The double space reproduces the old two-item output.
        print('NUMBER OF TOKENS:  %s' % (maxtok,))
        print('NUM TOKENS USED  %s' % (maxtok,))

        # For a quick debug run, narrow the query to a few families, e.g.
        # append: AND ("Family" = "Firefox" OR "Family" = "IE")
        c.execute('select Tokens, Family from data '
                  'where "Type" = "Browser" AND "Family" IS NOT NULL')
        uaslist = [[row[0], row[1]] for row in c]
    finally:
        # Everything needed has been copied out of the cursor.
        conn.close()

    def count_tokens(token_field):
        """Map each shifted token id below maxtok to its occurrence count."""
        counts = {}
        for raw in token_field.split(" "):
            tok = int(raw) + 1
            # NOTE(review): the strict '<' drops the largest id listed in
            # tokens.txt -- confirm whether '<=' was intended.
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    ntrain = int(round(frac * nbrowser, 0))

    trainingData = []
    testData = []
    familydict = {}  # families seen in either split
    for position, (token_field, family) in enumerate(uaslist):
        familydict[family] = 1
        sample = (family, count_tokens(token_field))
        if position < ntrain:
            trainingData.append(sample)
        else:
            testData.append(sample)

    print('Number browsers selected for training:  %s' % (ntrain,))
    print(str(familydict))
    print(len(familydict))

    # Make a Decision Problem and train.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Family", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # Persist the model and a "label:index" listing next to it.
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)
    with open(filename + '.labels', 'w') as flabel:
        # NOTE(review): indices follow set() iteration order; confirm they do
        # not need to agree with decProb.labelMap.
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    # Predict.
    # FIXME need to build in confusion matrix for these multiclass problems
    ytest, xtest = decProb.maketest(testData)
    pred_labels, (ACC, MSE, SCC), pred_values = svm_predict(ytest, xtest,
                                                            model)
    correct = 0.0
    total = 0.0
    for i, actual in enumerate(ytest):
        prediction = pred_labels[i]
        if actual == prediction:
            correct += 1.0
        else:
            print('actual:  %s predicted:  %s' % (actual, prediction))
        total += 1.0

    accuracy = correct / total
    print('ACCURACY:  %s' % (accuracy,))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write("ACCURACY: " + str(accuracy) + '\n')
    return accuracy