class BloomSpellChecker(object):
    """Suggest a spelling by probing a BloomFilter with variants of a word.

    Variants are built by changing the word's case and by collapsing runs of
    repeated characters (e.g. "heello" -> "hello").
    """

    def __init__(self):
        self.myBloomFilter = BloomFilter()

    def addWord(self, aWord):
        """Register aWord with the underlying filter as a known word."""
        self.myBloomFilter.addWord(aWord)

    def checkWord(self, aWordToCheck):
        """Return the first variant of aWordToCheck that the filter accepts.

        Variants are tried in the order produced by generateWordOptions().
        Returns the literal string "no correction found" when the filter
        accepts none of them.
        """
        for theWordToCheck in self.generateWordOptions(aWordToCheck):
            if self.myBloomFilter.checkWord(theWordToCheck):
                return theWordToCheck
        return "no correction found"

    def generateWordOptions(self, aWordToCheck):
        """Return the candidate spellings for aWordToCheck as a list.

        The order is deterministic: the word as given, lower-cased and
        capitalized come first, followed by the repeated-character variants
        of each of those three forms. Within each group the variants are
        ordered longest first (fewest characters removed), ties broken
        alphabetically. Each candidate appears once, at its first position.
        """
        theCaseVariants = [
            aWordToCheck,
            aWordToCheck.lower(),
            aWordToCheck.capitalize(),
        ]

        theCandidates = list(theCaseVariants)
        for theCaseVariant in theCaseVariants:
            theCollapsedForms = \
                self.generateWordOptionsByRemovingRepeatingCharacters(
                    theCaseVariant)
            # Sets have no defined iteration order; sort so that the
            # correction chosen by checkWord() is reproducible.
            theCandidates.extend(
                sorted(theCollapsedForms,
                       key=lambda aForm: (-len(aForm), aForm)))

        # Drop repeats (e.g. an already lower-case word equals its lower()
        # form) so the filter is probed only once per candidate.
        theSeen = set()
        theWordOptions = []
        for theCandidate in theCandidates:
            if theCandidate not in theSeen:
                theSeen.add(theCandidate)
                theWordOptions.append(theCandidate)
        return theWordOptions

    def generateWordOptionsByRemovingRepeatingCharacters(self, aWord):
        """Return the set of words obtained by collapsing repeated characters.

        For every pair of adjacent equal characters the first one may be kept
        or dropped, so the result contains aWord itself plus every shortened
        form. The result can double in size for each repeated character.
        """
        if len(aWord) <= 1:
            return set([aWord])

        # Build the options right-to-left, starting from the last character.
        # This is done iteratively so long words cannot hit the recursion
        # limit.
        theWordOptions = set([aWord[-1]])
        for theLetter in reversed(aWord[:-1]):
            theLongerOptions = set()
            for theTail in theWordOptions:
                if theLetter == theTail[0]:
                    # Dropping the duplicated letter leaves the tail as is.
                    theLongerOptions.add(theTail)
                theLongerOptions.add(theLetter + theTail)
            theWordOptions = theLongerOptions

        return theWordOptions
    def testFoo(self):
        # Add a handful of words to a filter built on the tracking hasher,
        # then ask the hasher to print its tracking data.
        theWordsToAdd = ( 'foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6' )

        theBloomFilter = BloomFilter( self.myTrackingHasher )
        for theWord in theWordsToAdd:
            theBloomFilter.addWord( theWord )

        self.myTrackingHasher.printTrackingData()
示例#3
0
    def testFoo(self):
        # Push seven words through a filter that uses the tracking hasher and
        # have the hasher print the tracking data it holds afterwards.
        theTrackingHasher = self.myTrackingHasher
        theBloomFilter = BloomFilter(theTrackingHasher)

        for theWord in ['foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6']:
            theBloomFilter.addWord(theWord)

        self.myTrackingHasher.printTrackingData()
from BloomFilter import BloomFilter
import random
import string


def removeMatchingWord(aWordToFind, aWords):
    """Remove one occurrence of aWordToFind from aWords, if it is present.

    aWords is modified in place; nothing happens when the word is absent.
    """
    if aWordToFind not in aWords:
        return
    aWords.remove(aWordToFind)


if __name__ == '__main__':
    # Load every dictionary word into the filter. The file is opened in a
    # with-block so the handle is closed as soon as loading is done.
    theBloomFilter = BloomFilter()
    with open('/usr/share/dict/words') as theDictionaryFile:
        for theLine in theDictionaryFile:
            theBloomFilter.addWord(theLine.strip())

    theWrongWords = []
    theCountOfRandomWords = 0

    # Keep generating random 5-letter lower-case strings until the filter has
    # accepted 300 of them.
    while len(theWrongWords) < 300:
        theCountOfRandomWords = theCountOfRandomWords + 1
        theRandomString = ''.join(
            random.choice(string.ascii_lowercase) for x in range(5))

        if theBloomFilter.checkWord(theRandomString):
            theWrongWords.append(theRandomString)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('# random words checked: ' + str(theCountOfRandomWords))

    [
from BloomFilter import BloomFilter
import random
import string

def removeMatchingWord( aWordToFind, aWords ):
    """Drop a single occurrence of aWordToFind from aWords when it is there.

    The collection is changed in place and left untouched if the word is
    missing.
    """
    theWordIsPresent = aWordToFind in aWords
    if theWordIsPresent:
        aWords.remove( aWordToFind )

if __name__ == '__main__':
    # Estimates the filter's false-positive rate: load the system dictionary,
    # probe the filter with random strings, and count how many accepted
    # strings are not real dictionary words.
    theBloomFilter = BloomFilter()
    theDictionaryWords = set()

    # Read the dictionary once, remembering the words so the accepted strings
    # can be checked against them later. The with-block closes the file.
    with open('/usr/share/dict/words') as theDictionaryFile:
        for theLine in theDictionaryFile:
            theWord = theLine.strip()
            theDictionaryWords.add( theWord )
            theBloomFilter.addWord( theWord )

    theWrongWords = []
    theCountOfRandomWords = 0

    # Keep generating random 5-letter lower-case strings until the filter has
    # accepted 300 of them.
    while len(theWrongWords) < 300:
        theCountOfRandomWords = theCountOfRandomWords + 1
        theRandomString = ''.join(random.choice(string.ascii_lowercase) for x in range(5))

        if theBloomFilter.checkWord( theRandomString ):
            theWrongWords.append( theRandomString )

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('# random words checked: ' + str( theCountOfRandomWords ))

    # Discard every accepted string that really is a dictionary word. All
    # occurrences are dropped, so a real word that was generated more than
    # once is never counted as a false positive.
    theWrongWords = [ theWord for theWord in theWrongWords if theWord not in theDictionaryWords ]

    print('# false positives: ' + str( len( theWrongWords ) ))

    print('false positive rate: ' + str( 100.0 * float( len( theWrongWords ) ) / float( theCountOfRandomWords ) ))
示例#6
0
class TestBloomFilter(unittest.TestCase):
    """Unit tests for BloomFilter.

    Most tests use a HasherForTest whose per-word results are set through its
    myTestResults mapping; one test uses the filter's default hasher.
    """

    def setUp(self):
        """Create a fresh test hasher and a filter that uses it."""
        theHasher = HasherForTest()
        self.myHasherForTest = theHasher
        self.myBloomFilter = BloomFilter(theHasher)

    def testHasBitField(self):
        # The filter must expose its bit storage as myBitArray.
        theBitArray = self.myBloomFilter.myBitArray
        self.assertIsNotNone(theBitArray, 'bitfield does not exist')

    def testBitFieldDefaultsToFalse(self):
        # A new filter starts with no bit set.
        theAnyBitIsSet = self.myBloomFilter.myBitArray.any()
        self.assertFalse(
            theAnyBitIsSet,
            'bitfield should be initialized to all Falses')

    def testAddWord(self):
        # Adding both words must set exactly the indexes configured for them.
        theConfiguredResults = self.myHasherForTest.myTestResults
        theConfiguredResults['foo'] = [4, 5]
        theConfiguredResults['bar'] = [2, 7, 5]

        for theWord in ('foo', 'bar'):
            self.myBloomFilter.addWord(theWord)

        self.assertThatOnlyIndexesAreTrue([4, 5, 2, 7])

    def assertThatOnlyIndexesAreTrue(self, anIndexes):
        """Assert that the filter's set bits are exactly those in anIndexes."""
        theLiveBits = self.myBloomFilter.myBitArray
        # Expected bits are cleared in this second bitarray (built from the
        # live one); anything still set at the end was unexpected.
        theLeftoverBits = bitarray(theLiveBits)

        for theIndex in anIndexes:
            theBitIsSet = theLiveBits[theIndex]
            theFailureMessage = (
                'index ' + str(theIndex) + ' is false when it should be true')
            self.assertTrue(theBitIsSet, theFailureMessage)
            theLeftoverBits[theIndex] = False

        self.assertFalse(
            theLeftoverBits.any(),
            'an unexpected index(es) were True, they should be false: '
            + str(theLeftoverBits))

    def testCheckWord(self):
        # Only 'foo' is added, so 'bar' must not be reported as present.
        theConfiguredResults = self.myHasherForTest.myTestResults
        theConfiguredResults['foo'] = [4, 5]
        theConfiguredResults['bar'] = [2, 7, 5]
        self.myBloomFilter.addWord('foo')

        theFooIsPresent = self.myBloomFilter.checkWord('foo')
        self.assertTrue(theFooIsPresent, 'word foo should be in the filter')
        theBarIsPresent = self.myBloomFilter.checkWord('bar')
        self.assertFalse(
            theBarIsPresent, 'word bar should NOT be in the filter')

    def testCheckWordWithRealHasher(self):
        # Same add/check round trip, but with the filter's default hasher.
        theBloomFilter = BloomFilter()

        theWordsToAdd = (
            "foo",
            "bar",
            "barf",
            "barge",
            "barn",
            "bart",
            "fnarfle-pants",
            "BLARG",
            "blarg",
            "a",
            "aardvark",
            "platypus",
            "melee",
            "somethingreallylong",
            "carrot",
            "derpa derpa der",
            "b",
            "#winning",
        )
        for theWord in theWordsToAdd:
            self.validateAddingWord(theBloomFilter, theWord)

        # Words that were never added must not be reported as present.
        for theAbsentWord in ("bat", "mele", "blah"):
            self.assertFalse(theBloomFilter.checkWord(theAbsentWord))

    def validateAddingWord(self, aBloomFilter, aWordToTest):
        """Assert aWordToTest is reported only after it has been added."""
        theReportedBefore = aBloomFilter.checkWord(aWordToTest)
        self.assertFalse(theReportedBefore)

        aBloomFilter.addWord(aWordToTest)

        theReportedAfter = aBloomFilter.checkWord(aWordToTest)
        self.assertTrue(theReportedAfter)