示例#1
0
def runSVM(C, seednum=18283835):
    """Train a linear-kernel SVM on browser families and report test accuracy.

    Rows whose Type is "Browser" and whose Family is not NULL are read from
    the SQLite database ``../mydb``, shuffled, and split 80/20 into training
    and test sets.  Each row's space-separated token ids are turned into a
    sparse ``{token_id + 1: count}`` dict.  The trained model is saved to
    ``decProb.modelPath()``; a ``.labels`` and a ``.results`` file are
    written alongside it.

    Output is produced with single-argument ``print(...)`` calls so that the
    emitted text is the same under the Python 2 print statement.

    Args:
        C: value stored in ``params.C`` before training.
        seednum: seed passed to ``random.seed`` so that the split is
            reproducible.  The sentinel ``'-1'`` (or the integer ``-1``)
            seeds from the system source instead.

    Returns:
        The fraction of test rows whose predicted label equals the actual
        label, as a float.

    Raises:
        ValueError: if the tokens file has no entries, or an id in it (or in
            a Tokens column) is not an integer.
        ZeroDivisionError: if ``ytest`` is empty.
    """
    # The sentinel historically was the string '-1' while the default seed is
    # an int; accept both spellings so an integer -1 is not used as a seed.
    if seednum == '-1' or seednum == -1:
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # fraction of the shuffled rows used for training

    # Highest token id (shifted by one, like the features below).
    with open('../../data/tokens.txt', 'r') as ftokens:
        keys = []
        for line in ftokens:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines such as a trailing newline
            _token, _sep, number = entry.partition(' ')
            keys.append(int(number) + 1)
    maxtok = max(keys)
    print('NUMBER OF TOKENS:  %s' % maxtok)
    print('NUM TOKENS USED  %s' % maxtok)

    def count_tokens(token_field):
        """Return {token_id + 1: occurrence count} for ids below maxtok."""
        counts = {}
        for piece in token_field.split(" "):
            tok = int(piece) + 1
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    # Fetch everything up front so the connection can be closed promptly.
    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()
        c.execute(
            'select Tokens, Family from data where "Type" = "Browser" AND "Family" IS NOT NULL'
        )
        uaslist = [[row[0], row[1]] for row in c]
    finally:
        conn.close()

    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    ntrain = int(round(frac * nbrowser, 0))

    trainingData = []
    testData = []
    familydict = {}  # families seen in either split (used as a set)

    def featurize(rows, sink):
        """Append (family, token counts) for every row to sink."""
        for token_field, family in rows:
            counts = count_tokens(token_field)
            familydict[family] = 1
            sink.append((family, counts))

    featurize(uaslist[:ntrain], trainingData)
    featurize(uaslist[ntrain:], testData)
    print('Number browsers selected for training:  %s' % ntrain)

    print(str(familydict))
    print(len(familydict))

    # Make a Decision Problem and train on the training split.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Family", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # FIXME need to build in confusion matrix for these multiclass problems
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)

    # NOTE(review): the index written here comes from enumerating a set, whose
    # iteration order is arbitrary; confirm it is meant to match the numeric
    # labels in decProb.labelMap.
    with open(filename + '.labels', 'w') as flabel:
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    # Predict on the held-out rows.
    ytest, xtest = decProb.maketest(testData)
    pred_labels, _metrics, _pred_values = svm_predict(ytest, xtest, model)

    correct = 0.0
    total = 0.0
    for i, actual in enumerate(ytest):
        prediction = pred_labels[i]
        if actual == prediction:
            correct += 1.0
        else:
            print('actual:  %s predicted:  %s' % (actual, prediction))
        total += 1.0

    accuracy = correct / total
    print("ACCURACY:  %s" % accuracy)

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write("ACCURACY: " + str(accuracy) + '\n')
    return accuracy
示例#2
0
def runSVMlinear(C=3.0, seednum=18283835):
    """Train a linear-kernel SVM separating robots from browsers.

    Rows of Type "Robot" and Type "Browser" are read from the SQLite database
    ``../mydb``.  Each class is shuffled and split 80/20 into training and
    test sets (robots first, then browsers), and each row's space-separated
    token ids become a sparse ``{token_id + 1: count}`` dict.  The trained
    model is saved to ``decProb.modelPath()`` with ``.labels`` and
    ``.results`` files alongside; the user-agent strings of misclassified
    test rows are written to ``misclass.txt``.

    Output is produced with single-argument ``print(...)`` calls so that the
    emitted text is the same under the Python 2 print statement.

    Args:
        C: value stored in ``params.C`` before training.
        seednum: seed passed to ``random.seed`` so that the split is
            reproducible.  The sentinel ``'-1'`` (or the integer ``-1``)
            seeds from the system source instead.

    Returns:
        A tuple ``(accuracy, correct0, false0, correct1, false1)`` of floats:
        ``correctN`` counts test rows with actual label N that were predicted
        correctly; ``false0`` counts rows with actual label 1 that were
        mispredicted and ``false1`` rows with actual label 0 that were
        mispredicted.  The printed report calls label 0 "Robot" and label 1
        "Browser" -- presumably per ``decProb.labelMap``; TODO confirm.

    Raises:
        ValueError: if the tokens file has no entries, or an id in it (or in
            a Tokens column) is not an integer.
        ZeroDivisionError: if ``ytest`` is empty.
    """
    # The sentinel historically was the string '-1' while the default seed is
    # an int; accept both spellings so an integer -1 is not used as a seed.
    if seednum == '-1' or seednum == -1:
        random.seed()
    else:
        random.seed(seednum)  # use this for standard validation

    frac = 0.8  # fraction of each class used for training

    # Highest token id (shifted by one, like the features below).
    with open('../../data/tokens.txt', 'r') as ftokens:
        keys = []
        for line in ftokens:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines such as a trailing newline
            _token, _sep, number = entry.partition(' ')
            keys.append(int(number) + 1)
    maxtok = max(keys)
    print("NUMBER OF TOKENS:  %s" % maxtok)
    print("NUM TOKENS USED  %s" % maxtok)

    def count_tokens(token_field):
        """Return {token_id + 1: occurrence count} for ids below maxtok."""
        counts = {}
        for piece in token_field.split(" "):
            tok = int(piece) + 1
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    trainingData = []
    testData = []
    testlist = []  # user-agent strings, parallel to testData

    def add_class(rows, label):
        """Shuffle rows, split them by frac, and append labelled features.

        Returns the number of rows that went into the training split.
        """
        random.shuffle(rows)
        ntrain = int(round(frac * len(rows), 0))
        for token_field, _ua in rows[:ntrain]:
            trainingData.append((label, count_tokens(token_field)))
        for token_field, ua in rows[ntrain:]:
            testData.append((label, count_tokens(token_field)))
            testlist.append(ua)
        return ntrain

    # Fetch both classes up front so the connection can be closed promptly.
    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()
        c.execute('select Tokens, uaString from data where "Type" = "Robot"')
        robots = [(row[0], row[1]) for row in c]
        c.execute('select Tokens, uaString from data where "Type" = "Browser"')
        browsers = [(row[0], row[1]) for row in c]
    finally:
        conn.close()

    # Robots are shuffled before browsers so a fixed seed gives a fixed split.
    nrobot_train = add_class(robots, 'Robot')
    print('Number robots selected for training:  %s' % nrobot_train)
    nbrowser_train = add_class(browsers, 'Browser')
    print('Number browsers selected for training:  %s' % nbrowser_train)

    # Make a Decision Problem and train on the training split.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Type", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # get path for model
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)

    # NOTE(review): the index written here comes from enumerating a set, whose
    # iteration order is arbitrary; confirm it is meant to match the numeric
    # labels in decProb.labelMap.
    with open(filename + '.labels', 'w') as flabel:
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    ytest, xtest = decProb.makedata(testData)
    pred_labels, _metrics, _pred_values = svm_predict(ytest, xtest, model)

    # Predict
    correct = 0.0
    total = 0.0
    correct0 = 0.0
    correct1 = 0.0
    false1 = 0.0
    false0 = 0.0
    # testlist[i] is used as the user agent for ytest[i]; this assumes
    # makedata keeps the order of testData -- TODO confirm.
    with open('misclass.txt', 'w') as fmiss:
        for i, actual in enumerate(ytest):
            prediction = pred_labels[i]
            if actual == 0:
                if actual == prediction:
                    correct0 += 1.0
                    correct += 1.0
                else:
                    false1 += 1.0
                    fmiss.write('false Browser: ' + testlist[i] + '\n')
            elif actual == 1:
                if actual == prediction:
                    correct1 += 1.0
                    correct += 1.0
                else:
                    false0 += 1.0
                    fmiss.write('false Robot: ' + testlist[i] + '\n')
            # Rows with any other label only contribute to the total.
            total += 1.0

    accuracy = correct / total
    explanation = "False X means classifier said data was X, but it was actually something else"
    print("ACCURACY:  %s" % accuracy)
    print(explanation)
    print("Correct Robot:  %s False Robot:  %s" % (correct0, false0))
    print("Correct Browser:  %s False Browser:  %s" % (correct1, false1))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write(''.join([
            "ACCURACY: " + str(accuracy) + '\n',
            explanation + "\n",
            "Correct 0: " + str(correct0) + " False 0: " + str(false0) + '\n',
            "Correct 1: " + str(correct1) + " False 1: " + str(false1) + '\n',
        ]))
    return accuracy, correct0, false0, correct1, false1
示例#3
0
def runSVMlinear(C=3.0, gamma=0.1, seednum=18283835):
    """Train an SVM separating robots from browsers (RBF kernel variant).

    Despite the function name, this variant sets ``params.kernelName`` to
    ``"RBF"`` and passes ``gamma`` through ``params.gamma``.

    Rows of Type "Robot" and Type "Browser" are read from the SQLite database
    ``../mydb``.  Each class is shuffled and split 80/20 into training and
    test sets (robots first, then browsers), and each row's space-separated
    token ids become a sparse ``{token_id + 1: count}`` dict.  The trained
    model is saved to ``decProb.modelPath()`` with ``.labels`` and
    ``.results`` files alongside; the user-agent strings of misclassified
    test rows are written to ``misclass.txt``.

    Output is produced with single-argument ``print(...)`` calls so that the
    emitted text is the same under the Python 2 print statement.

    Args:
        C: value stored in ``params.C`` before training.
        gamma: value stored in ``params.gamma`` before training.
        seednum: seed passed to ``random.seed`` so that the split is
            reproducible.  The sentinel ``'-1'`` (or the integer ``-1``)
            seeds from the system source instead.

    Returns:
        A tuple ``(accuracy, correct0, false0, correct1, false1)`` of floats:
        ``correctN`` counts test rows with actual label N that were predicted
        correctly; ``false0`` counts rows with actual label 1 that were
        mispredicted and ``false1`` rows with actual label 0 that were
        mispredicted.  The printed report calls label 0 "Robot" and label 1
        "Browser" -- presumably per ``decProb.labelMap``; TODO confirm.

    Raises:
        ValueError: if the tokens file has no entries, or an id in it (or in
            a Tokens column) is not an integer.
        ZeroDivisionError: if ``ytest`` is empty.
    """
    # The sentinel historically was the string '-1' while the default seed is
    # an int; accept both spellings so an integer -1 is not used as a seed.
    if seednum == '-1' or seednum == -1:
        random.seed()
    else:
        random.seed(seednum)  # use this for standard validation

    frac = 0.8  # fraction of each class used for training

    # Highest token id (shifted by one, like the features below).
    with open('../../data/tokens.txt', 'r') as ftokens:
        keys = []
        for line in ftokens:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines such as a trailing newline
            _token, _sep, number = entry.partition(' ')
            keys.append(int(number) + 1)
    maxtok = max(keys)
    print("NUMBER OF TOKENS:  %s" % maxtok)
    print("NUM TOKENS USED  %s" % maxtok)

    def count_tokens(token_field):
        """Return {token_id + 1: occurrence count} for ids below maxtok."""
        counts = {}
        for piece in token_field.split(" "):
            tok = int(piece) + 1
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    trainingData = []
    testData = []
    testlist = []  # user-agent strings, parallel to testData

    def add_class(rows, label):
        """Shuffle rows, split them by frac, and append labelled features.

        Returns the number of rows that went into the training split.
        """
        random.shuffle(rows)
        ntrain = int(round(frac * len(rows), 0))
        for token_field, _ua in rows[:ntrain]:
            trainingData.append((label, count_tokens(token_field)))
        for token_field, ua in rows[ntrain:]:
            testData.append((label, count_tokens(token_field)))
            testlist.append(ua)
        return ntrain

    # Fetch both classes up front so the connection can be closed promptly.
    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()
        c.execute('select Tokens, uaString from data where "Type" = "Robot"')
        robots = [(row[0], row[1]) for row in c]
        c.execute('select Tokens, uaString from data where "Type" = "Browser"')
        browsers = [(row[0], row[1]) for row in c]
    finally:
        conn.close()

    # Robots are shuffled before browsers so a fixed seed gives a fixed split.
    nrobot_train = add_class(robots, 'Robot')
    print('Number robots selected for training:  %s' % nrobot_train)
    nbrowser_train = add_class(browsers, 'Browser')
    print('Number browsers selected for training:  %s' % nbrowser_train)

    # Make a Decision Problem and train on the training split.
    params = LinearSVMParams()
    params.kernelName = "RBF"
    params.gamma = gamma
    params.C = C
    decProb = DecisionProblem("Type", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # get path for model
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)

    # NOTE(review): the index written here comes from enumerating a set, whose
    # iteration order is arbitrary; confirm it is meant to match the numeric
    # labels in decProb.labelMap.
    with open(filename + '.labels', 'w') as flabel:
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    ytest, xtest = decProb.makedata(testData)
    pred_labels, _metrics, _pred_values = svm_predict(ytest, xtest, model)

    # Predict
    correct = 0.0
    total = 0.0
    correct0 = 0.0
    correct1 = 0.0
    false1 = 0.0
    false0 = 0.0
    # testlist[i] is used as the user agent for ytest[i]; this assumes
    # makedata keeps the order of testData -- TODO confirm.
    with open('misclass.txt', 'w') as fmiss:
        for i, actual in enumerate(ytest):
            prediction = pred_labels[i]
            if actual == 0:
                if actual == prediction:
                    correct0 += 1.0
                    correct += 1.0
                else:
                    false1 += 1.0
                    fmiss.write('false Browser: ' + testlist[i] + '\n')
            elif actual == 1:
                if actual == prediction:
                    correct1 += 1.0
                    correct += 1.0
                else:
                    false0 += 1.0
                    fmiss.write('false Robot: ' + testlist[i] + '\n')
            # Rows with any other label only contribute to the total.
            total += 1.0

    accuracy = correct / total
    explanation = "False X means classifier said data was X, but it was actually something else"
    print("ACCURACY:  %s" % accuracy)
    print(explanation)
    print("Correct Robot:  %s False Robot:  %s" % (correct0, false0))
    print("Correct Browser:  %s False Browser:  %s" % (correct1, false1))

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write(''.join([
            "ACCURACY: " + str(accuracy) + '\n',
            explanation + "\n",
            "Correct 0: " + str(correct0) + " False 0: " + str(false0) + '\n',
            "Correct 1: " + str(correct1) + " False 1: " + str(false1) + '\n',
        ]))
    return accuracy, correct0, false0, correct1, false1
示例#4
0
def runSVM(C, seednum=18283835):
    """Train a linear-kernel SVM on browser families and report test accuracy.

    Rows whose Type is "Browser" and whose Family is not NULL are read from
    the SQLite database ``../mydb``, shuffled, and split 80/20 into training
    and test sets.  Each row's space-separated token ids are turned into a
    sparse ``{token_id + 1: count}`` dict.  The trained model is saved to
    ``decProb.modelPath()``; a ``.labels`` and a ``.results`` file are
    written alongside it.

    Output is produced with single-argument ``print(...)`` calls so that the
    emitted text is the same under the Python 2 print statement.

    Args:
        C: value stored in ``params.C`` before training.
        seednum: seed passed to ``random.seed`` so that the split is
            reproducible.  The sentinel ``'-1'`` (or the integer ``-1``)
            seeds from the system source instead.

    Returns:
        The fraction of test rows whose predicted label equals the actual
        label, as a float.

    Raises:
        ValueError: if the tokens file has no entries, or an id in it (or in
            a Tokens column) is not an integer.
        ZeroDivisionError: if ``ytest`` is empty.
    """
    # The sentinel historically was the string '-1' while the default seed is
    # an int; accept both spellings so an integer -1 is not used as a seed.
    if seednum == '-1' or seednum == -1:
        random.seed()
    else:
        random.seed(seednum)

    frac = 0.8  # fraction of the shuffled rows used for training

    # Highest token id (shifted by one, like the features below).
    with open('../../data/tokens.txt', 'r') as ftokens:
        keys = []
        for line in ftokens:
            entry = line.strip()
            if not entry:
                continue  # tolerate blank lines such as a trailing newline
            _token, _sep, number = entry.partition(' ')
            keys.append(int(number) + 1)
    maxtok = max(keys)
    print('NUMBER OF TOKENS:  %s' % maxtok)
    print('NUM TOKENS USED  %s' % maxtok)

    def count_tokens(token_field):
        """Return {token_id + 1: occurrence count} for ids below maxtok."""
        counts = {}
        for piece in token_field.split(" "):
            tok = int(piece) + 1
            if tok < maxtok:
                counts[tok] = counts.get(tok, 0.0) + 1.0
        return counts

    # Fetch everything up front so the connection can be closed promptly.
    conn = sqlite3.connect("../mydb")
    try:
        conn.text_factory = str
        c = conn.cursor()
        c.execute('select Tokens, Family from data where "Type" = "Browser" AND "Family" IS NOT NULL')
        uaslist = [[row[0], row[1]] for row in c]
    finally:
        conn.close()

    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    ntrain = int(round(frac * nbrowser, 0))

    trainingData = []
    testData = []
    familydict = {}  # families seen in either split (used as a set)

    def featurize(rows, sink):
        """Append (family, token counts) for every row to sink."""
        for token_field, family in rows:
            counts = count_tokens(token_field)
            familydict[family] = 1
            sink.append((family, counts))

    featurize(uaslist[:ntrain], trainingData)
    featurize(uaslist[ntrain:], testData)
    print('Number browsers selected for training:  %s' % ntrain)

    print(str(familydict))
    print(len(familydict))

    # Make a Decision Problem and train on the training split.
    params = LinearSVMParams()
    params.kernelName = "linear"
    params.C = C
    decProb = DecisionProblem("Family", params)
    y, x = decProb.makedata(trainingData)
    options = decProb.genoptions()
    print(options)
    model = svm_train(y, x, options)

    # FIXME need to build in confusion matrix for these multiclass problems
    filename = decProb.modelPath()
    print('Filename:  %s' % (filename,))
    print('saving model')
    svm_save_model(filename, model)

    # NOTE(review): the index written here comes from enumerating a set, whose
    # iteration order is arbitrary; confirm it is meant to match the numeric
    # labels in decProb.labelMap.
    with open(filename + '.labels', 'w') as flabel:
        for i, label in enumerate(set(decProb.labels)):
            flabel.write(label + ':' + str(i) + '\n')

    # Predict on the held-out rows.
    ytest, xtest = decProb.maketest(testData)
    pred_labels, _metrics, _pred_values = svm_predict(ytest, xtest, model)

    correct = 0.0
    total = 0.0
    for i, actual in enumerate(ytest):
        prediction = pred_labels[i]
        if actual == prediction:
            correct += 1.0
        else:
            print('actual:  %s predicted:  %s' % (actual, prediction))
        total += 1.0

    accuracy = correct / total
    print("ACCURACY:  %s" % accuracy)

    with open(decProb.modelPath() + ".results", 'w') as f:
        f.write(str(decProb.labelMap) + '\n')
        f.write("ACCURACY: " + str(accuracy) + '\n')
    return accuracy