示例#1
0
def runSVMedit(C, gamma):

    # initialize with seed so we get same training and test sets every time
    # i.e. for cross validation
    random.seed(18283835)  # use this for standard validation
    #    random.seed(8982101)
    #    random.seed(12820501)

    # Throwaway test to classify browser vs bot
    frac = 0.8

    # Open the database
    conn = sqlite3.connect("mydb")
    conn.text_factory = str
    c = conn.cursor()

    # Data
    trainingData = []
    testData = []

    trainlist = []
    testlist = []
    # bots
    c.execute('select uaString from data where "Type" = "Robot"')
    uaslist = []
    for uaString in c:
        uaslist.append(uaString[0])
    nrobot = len(uaslist)
    random.shuffle(uaslist)
    for uaString in uaslist[0 : int(round(frac * nrobot, 0))]:
        trainingData.append(("Robot", uaString))
        trainlist.append(uaString)
    for uaString in uaslist[int(round(frac * nrobot, 0)) : len(uaslist) + 1]:
        testData.append(("Robot", uaString))
        testlist.append(uaString)
    print "Number robots selected for training: ", int(round(frac * nrobot, 0))

    # browsers
    c.execute('select uaString from data where "Type" = "Browser"')
    uaslist = []
    for uaString in c:
        uaslist.append(uaString[0])
    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    for uaString in uaslist[0 : int(round(frac * nbrowser, 0))]:
        trainingData.append(("Browser", uaString))
        trainlist.append(uaString)
    for uaString in uaslist[int(round(frac * nbrowser, 0)) : len(uaslist) + 1]:
        testData.append(("Browser", uaString))
        testlist.append(uaString)
    print "Number browsers selected for training: ", int(round(frac * nbrowser, 0))

    # just select first 5000 elements of train data and first 1000 elements of test data
    #    random.shuffle(trainingData)
    #    random.shuffle(testData)
    #    trainingData = trainingData[0:1000]
    #    testData = testData[0:200]

    # Make a Decision Problem
    params = SVMParams()
    params.kernelName = "edit"
    params.tokenized = False
    params.C = C
    params.gamma = gamma
    decProb = DecisionProblem("Type", params)

    # If the model is not already generated, generate it
    if not decProb.haveModel():
        decProb.generateModel(trainingData)
        decProb.saveModel()
    else:
        decProb.loadModel()

    # Predict
    correct = 0.0
    total = 0.0
    correct0 = 0.0
    correct1 = 0.0
    false1 = 0.0
    false0 = 0.0
    ind = 0
    fmiss = open("misclass.txt", "w")
    for actual, ua in testData:
        prediction = decProb.decide(ua)
        if actual == "Robot":
            if actual == prediction:
                correct0 = correct0 + 1.0
                correct = correct + 1.0
            else:
                false1 = false1 + 1.0
                fmiss.write("false Browser: " + testlist[ind] + "\n")
        elif actual == "Browser":
            if actual == prediction:
                correct1 = correct1 + 1.0
                correct = correct + 1.0
            else:
                false0 = false0 + 1.0
                fmiss.write("false Robot: " + testlist[ind] + "\n")

        total = total + 1.0
        ind = ind + 1

    print "ACCURACY: ", correct / total
    print "False X means classifier said data was X, but it was actually something else"
    print "Correct Robot: ", correct0, "False Robot: ", false0
    print "Correct Browser: ", correct1, "False Browser: ", false1

    fname = decProb.modelPath() + ".results"
    f = open(fname, "w")
    f.write(str(decProb.svm.labelMap) + "\n")
    s = "ACCURACY: " + str(correct / total) + "\n"
    s = s + "False X means classifier said data was X, but it was actually something else\n"
    s = s + "Correct 0: " + str(correct0) + " False 0: " + str(false0) + "\n"
    s = s + "Correct 1: " + str(correct1) + " False 1: " + str(false1) + "\n"
    f.write(s)
    accuracy = correct / total
    return accuracy, correct0, false0, correct1, false1
示例#2
0
def runSVM(C,gamma):

#initialize with seed so we get same training and test sets every time
# i.e. for cross validation
    random.seed(18283835)

# Throwaway test to classify browser vs bot
    frac = 0.8

# Open the database
    conn = sqlite3.connect("mydb")
    conn.text_factory = str
    c = conn.cursor()

# Data
    trainingData = [];
    testData = [];

#get Families
    c.execute('select Tokens, Family from data where "Type" = "Browser" AND "Family" IS NOT NULL')
    uaslist = []
    familylist = []
    for row in c:
        uaslist.append([row[0],row[1]]) # stored as list of lists
    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    for uaString in uaslist[0:int(round(frac*nbrowser,0))]:
        tokens = [int(s) for s in uaString[0].split(" ")]
        family = uaString[1] 
        trainingData.append((family, tokens))
    for uaString in uaslist[int(round(frac*nbrowser,0)):len(uaslist)+1]:
        tokens = [int(s) for s in uaString[0].split(" ")]
        family = uaString[1]
        testData.append((family, tokens))
    print 'Number browsers selected for training: ', int(round(frac*nbrowser,0))    
    
# just select first 5000 elements of train data and first 1000 elements of test data
#    random.shuffle(trainingData)    
#    random.shuffle(testData)
#    trainingData = trainingData[0:1000]
#    testData = testData[0:200]


# Make a Decision Problem
    params = SVMParams()
    params.kernelName = "edit"
    params.tokenized = True
    params.C = C
    params.gamma = gamma
    decProb = DecisionProblem("Family", params)

# If the model is not already generated, generate it
    if not decProb.haveModel():
        decProb.generateModel(trainingData)
        decProb.saveModel()
    else:
        decProb.loadModel()

# Predict
# FIXME need to build in confusion matrix for these multiclass problems
    correct = 0.0
    total = 0.0
    for actual, ua in testData:
        prediction = decProb.decide(ua)
        if(actual == prediction):
            correct = correct + 1.0
        total = total + 1.0

    print "ACCURACY: ", correct / total

    fname = decProb.modelPath() + ".results"
    f = open(fname,'w')
    f.write(str(decProb.svm.labelMap)+'\n')
    s = "ACCURACY: " + str(correct/total) + '\n'
    f.write(s)
    accuracy = correct/total    
    return accuracy
示例#3
0
def runSVM(C, seqLen, seqLambda):

    #initialize with seed so we get same training and test sets every time
    # i.e. for cross validation
    random.seed(18283835)  # use this for standard validation
    #    random.seed(8982101)
    #    random.seed(12820501)

    # Throwaway test to classify browser vs bot
    frac = 0.8

    # Open the database
    conn = sqlite3.connect("mydb")
    conn.text_factory = str
    c = conn.cursor()

    # Data
    trainingData = []
    testData = []

    trainlist = []
    testlist = []
    # bots
    c.execute('select Tokens, uaString from data where "Type" = "Robot"')
    uaslist = []
    for uaString in c:
        uaslist.append((uaString[0], uaString[1]))
    nrobot = len(uaslist)
    random.shuffle(uaslist)
    for Tok, uaString in uaslist[0:int(round(frac * nrobot, 0))]:
        tokens = [int(s) for s in Tok.split(" ")]
        trainingData.append(('Robot', tokens))
        trainlist.append(uaString)
    for Tok, uaString in uaslist[int(round(frac * nrobot, 0)):len(uaslist) +
                                 1]:
        tokens = [int(s) for s in Tok.split(" ")]
        testData.append(('Robot', tokens))
        testlist.append(uaString)
    print 'Number robots selected for training: ', int(round(frac * nrobot, 0))

    # browsers
    c.execute('select Tokens, uaString from data where "Type" = "Browser"')
    uaslist = []
    for uaString in c:
        uaslist.append((uaString[0], uaString[1]))
    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    for Tok, uaString in uaslist[0:int(round(frac * nbrowser, 0))]:
        tokens = [int(s) for s in Tok.split(" ")]
        trainingData.append(('Browser', tokens))
        trainlist.append(uaString)
    for Tok, uaString in uaslist[int(round(frac * nbrowser, 0)):len(uaslist) +
                                 1]:
        tokens = [int(s) for s in Tok.split(" ")]
        testData.append(('Browser', tokens))
        testlist.append(uaString)
    print 'Number browsers selected for training: ', int(
        round(frac * nbrowser, 0))

    # just select first 5000 elements of train data and first 1000 elements of test data
    #    random.shuffle(trainingData)
    #    random.shuffle(testData)
    #    trainingData = trainingData[0:1000]
    #    testData = testData[0:200]

    # Make a Decision Problem
    params = SVMParams()
    params.kernelName = "subseq"
    params.dataType = "tokens"
    params.C = C
    params.seqLen = seqLen
    params.seqLambda = seqLambda
    decProb = DecisionProblem("Type", params)

    # If the model is not already generated, generate it
    if not decProb.haveModel():
        decProb.generateModel(trainingData)
        decProb.saveModel()
    else:
        decProb.loadModel()


# Predict
    correct = 0.0
    total = 0.0
    correct0 = 0.0
    correct1 = 0.0
    false1 = 0.0
    false0 = 0.0
    ind = 0
    fmiss = open('misclass.txt', 'w')
    for actual, ua in testData:
        prediction = decProb.decide(ua)
        if (actual == 'Robot'):
            if (actual == prediction):
                correct0 = correct0 + 1.0
                correct = correct + 1.0
            else:
                false1 = false1 + 1.0
                fmiss.write('false Browser: ' + testlist[ind] + '\n')
        elif (actual == 'Browser'):
            if (actual == prediction):
                correct1 = correct1 + 1.0
                correct = correct + 1.0
            else:
                false0 = false0 + 1.0
                fmiss.write('false Robot: ' + testlist[ind] + '\n')

        total = total + 1.0
        ind = ind + 1

    print "ACCURACY: ", correct / total
    print "False X means classifier said data was X, but it was actually something else"
    print "Correct Robot: ", correct0, "False Robot: ", false0
    print "Correct Browser: ", correct1, "False Browser: ", false1

    fname = decProb.modelPath() + ".results"
    f = open(fname, 'w')
    f.write(str(decProb.svm.labelMap) + '\n')
    s = "ACCURACY: " + str(correct / total) + '\n'
    s = s + "False X means classifier said data was X, but it was actually something else\n"
    s = s + "Correct 0: " + str(correct0) + " False 0: " + str(false0) + '\n'
    s = s + "Correct 1: " + str(correct1) + " False 1: " + str(false1) + '\n'
    f.write(s)
    accuracy = correct / total
    return accuracy, correct0, false0, correct1, false1
示例#4
0
def runSVM(C, gamma):

    #initialize with seed so we get same training and test sets every time
    # i.e. for cross validation
    random.seed(18283835)

    # Throwaway test to classify browser vs bot
    frac = 0.8

    # Open the database
    conn = sqlite3.connect("mydb")
    conn.text_factory = str
    c = conn.cursor()

    # Data
    trainingData = []
    testData = []

    #get Families
    c.execute(
        'select Tokens, Family from data where "Type" = "Browser" AND "Family" IS NOT NULL'
    )
    uaslist = []
    familylist = []
    for row in c:
        uaslist.append([row[0], row[1]])  # stored as list of lists
    nbrowser = len(uaslist)
    random.shuffle(uaslist)
    for uaString in uaslist[0:int(round(frac * nbrowser, 0))]:
        tokens = [int(s) for s in uaString[0].split(" ")]
        family = uaString[1]
        trainingData.append((family, tokens))
    for uaString in uaslist[int(round(frac * nbrowser, 0)):len(uaslist) + 1]:
        tokens = [int(s) for s in uaString[0].split(" ")]
        family = uaString[1]
        testData.append((family, tokens))
    print 'Number browsers selected for training: ', int(
        round(frac * nbrowser, 0))

    # just select first 5000 elements of train data and first 1000 elements of test data
    #    random.shuffle(trainingData)
    #    random.shuffle(testData)
    #    trainingData = trainingData[0:1000]
    #    testData = testData[0:200]

    # Make a Decision Problem
    params = SVMParams()
    params.kernelName = "edit"
    params.tokenized = True
    params.C = C
    params.gamma = gamma
    decProb = DecisionProblem("Family", params)

    # If the model is not already generated, generate it
    if not decProb.haveModel():
        decProb.generateModel(trainingData)
        decProb.saveModel()
    else:
        decProb.loadModel()


# Predict
# FIXME need to build in confusion matrix for these multiclass problems
    correct = 0.0
    total = 0.0
    for actual, ua in testData:
        prediction = decProb.decide(ua)
        if (actual == prediction):
            correct = correct + 1.0
        total = total + 1.0

    print "ACCURACY: ", correct / total

    fname = decProb.modelPath() + ".results"
    f = open(fname, 'w')
    f.write(str(decProb.svm.labelMap) + '\n')
    s = "ACCURACY: " + str(correct / total) + '\n'
    f.write(s)
    accuracy = correct / total
    return accuracy