예제 #1
0
def generateKaggleSubmission(weights, outfilename):
    with open(outfilename, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['Id', 'Prediction'])
        writer.writeheader()

        # Test data is used for private leaderboard
        testData = dataIterator(TESTKEY, test_mode=True)
        for i, (counts, _) in enumerate(testData):
            predictedLabel, _ = predict(counts, weights, ALL_LABELS)
            predictedIndex = ALL_LABELS.index(predictedLabel)
            writer.writerow({
                'Id': 'test-{}'.format(i),
                'Prediction': predictedIndex})

        # Dev data is used for public leaderboard
        devData = dataIterator(DEVKEY, test_mode=False)
        devCorrect = 0
        devTotal = 0
        for i, (counts, label) in enumerate(devData):
            devTotal += 1
            predictedLabel, _ = predict(counts, weights, ALL_LABELS)
            devCorrect += (predictedLabel == label)
            predictedIndex = ALL_LABELS.index(predictedLabel)
            writer.writerow({
                'Id': 'dev-{}'.format(i),
                'Prediction': predictedIndex})

    devAccuracy = float(devCorrect) / devTotal
    print 'Dev accuracy is ', devAccuracy, '({} correct of {})'.format(devCorrect, devTotal)
    print 'Kaggle submission saved to', outfilename, ('. Sanity check: '
                                                      'public leaderboard accuracy should be '), devAccuracy, 'on submission.'
예제 #2
0
def generateKaggleSubmission(weights,outfilename):
    with open(outfilename, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=['Id', 'Prediction'])
        writer.writeheader()

        # Test data is used for private leaderboard
        testData = dataIterator(TESTKEY,test_mode=True)
        for i,(counts,_) in enumerate(testData):
            predictedLabel,_ = predict(counts,weights,ALL_LABELS)
            predictedIndex = ALL_LABELS.index(predictedLabel)
            writer.writerow({
                'Id': 'test-{}'.format(i),
                'Prediction': predictedIndex})

        # Dev data is used for public leaderboard
        devData = dataIterator(DEVKEY,test_mode=False)
        devCorrect = 0
        devTotal = 0
        for i,(counts,label) in enumerate(devData):
            devTotal += 1
            predictedLabel,_ = predict(counts,weights,ALL_LABELS)
            devCorrect += (predictedLabel == label)
            predictedIndex = ALL_LABELS.index(predictedLabel)
            writer.writerow({
                'Id': 'dev-{}'.format(i),
                'Prediction': predictedIndex})
    
    devAccuracy = float(devCorrect) / devTotal
    print 'Dev accuracy is ', devAccuracy, '({} correct of {})'.format(devCorrect, devTotal)
    print 'Kaggle submission saved to', outfilename, ('. Sanity check: '
        'public leaderboard accuracy should be '), devAccuracy, 'on submission.'
예제 #3
0
def setup_module():
    """Regenerate the BOW files and cache aggregate counts for train/dev.

    dataIterator reads the bag-of-words files, so docsToBOWs must run for
    each split before the counts can be collected.
    """
    global ac_train, ac_dev
    for split in (TRAINKEY, DEVKEY):
        docsToBOWs(split)
    ac_train = getAllCounts(dataIterator(TRAINKEY))
    ac_dev = getAllCounts(dataIterator(DEVKEY))
예제 #4
0
def setup_module():
    """Prepare module-level count caches for the tests.

    The BOW files have to exist before dataIterator can be used, hence
    the docsToBOWs calls up front for both splits.
    """
    global ac_train, ac_dev
    docsToBOWs(TRAINKEY)
    docsToBOWs(DEVKEY)
    ac_train, ac_dev = (getAllCounts(dataIterator(key))
                        for key in (TRAINKEY, DEVKEY))
예제 #5
0
def evalClassifier(weights, outfilename, testfile, test_mode=False):
    """Run the classifier over an eval set, one predicted label per line.

    Writes predictions to outfilename in eval-set order. Returns the
    scorer's confusion matrix, or None in test mode (test data carries
    no gold labels to score against).
    """
    with open(outfilename, 'w') as outfile:
        for counts, _ in dataIterator(testfile, test_mode):
            predicted = predict(counts, weights, ALL_LABELS)[0]
            outfile.write('%s\n' % (predicted,))
    if test_mode:
        return None
    return gtnlplib.scorer.getConfusion(testfile, outfilename)
예제 #6
0
def evalClassifier(weights, outfilename, testfile, test_mode=False):
    """Score a trained classifier against an evaluation file.

    The predicted label for every example in testfile is written to
    outfilename, one per line. When test_mode is False the scorer's
    confusion matrix is returned; in test mode there are no gold labels,
    so nothing is returned.
    """
    with open(outfilename, 'w') as outfile:
        for counts, _ in dataIterator(testfile, test_mode):
            print >> outfile, predict(counts, weights, ALL_LABELS)[0]
    if not test_mode:
        return gtnlplib.scorer.getConfusion(testfile, outfilename)