class BloomSpellChecker(object):
    """Spell checker that tests candidate spellings of a word against a BloomFilter."""

    def __init__(self):
        # Words are stored in, and looked up through, this filter only.
        self.myBloomFilter = BloomFilter()

    def addWord(self, aWord):
        """Add aWord to the underlying filter."""
        self.myBloomFilter.addWord(aWord)

    def checkWord(self, aWordToCheck):
        """Return the first candidate spelling of aWordToCheck that the filter accepts.

        Candidates are produced by generateWordOptions and tried in the order
        it returns them.  If the filter accepts none of them, the literal
        string "no correction found" is returned.
        """
        for theCandidate in self.generateWordOptions(aWordToCheck):
            if self.myBloomFilter.checkWord(theCandidate):
                return theCandidate
        return "no correction found"

    def generateWordOptions(self, aWordToCheck):
        """Return the list of candidate spellings for aWordToCheck.

        The list starts with the word as given, its lower-cased form and its
        capitalized form, followed by the repeated-character reductions of
        each of those three forms (in that order).  Duplicates are kept.
        """
        theCaseVariants = [
            aWordToCheck,
            aWordToCheck.lower(),
            aWordToCheck.capitalize(),
        ]
        theCandidates = list(theCaseVariants)
        for theVariant in theCaseVariants:
            theCandidates.extend(
                self.generateWordOptionsByRemovingRepeatingCharacters(theVariant)
            )
        return theCandidates

    def generateWordOptionsByRemovingRepeatingCharacters(self, aWord):
        """Return the set of spellings obtained by collapsing adjacent repeated letters.

        Each run of identical adjacent characters may be shortened by any
        amount down to a single character; the unmodified word is always part
        of the result.  A word of length 0 or 1 yields a set holding just
        that word.
        """
        if len(aWord) <= 1:
            return set([aWord])

        # Build the options right-to-left: start from the last letter and
        # prepend one letter at a time, optionally merging it with an equal
        # letter that already leads the shorter option.
        theOptions = set([aWord[-1]])
        for theLetter in reversed(aWord[:-1]):
            theLongerOptions = set()
            for theTail in theOptions:
                if theLetter == theTail[0]:
                    # Same letter twice in a row: also offer the collapsed form.
                    theLongerOptions.add(theLetter + theTail[1:])
                theLongerOptions.add(theLetter + theTail)
            theOptions = theLongerOptions
        return theOptions
def testFoo(self):
    """Add 'foo' and 'foo1'..'foo6' to a filter built on the tracking hasher, then print the hasher's tracking data."""
    theFilter = BloomFilter( self.myTrackingHasher )
    theWordsToAdd = ( 'foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6' )
    for theWord in theWordsToAdd:
        theFilter.addWord( theWord )
    # No assertions here: the test only dumps what the hasher recorded.
    self.myTrackingHasher.printTrackingData()
def testFoo(self):
    """Feed seven 'foo…' words through a BloomFilter that uses the tracking hasher and print what the hasher tracked."""
    theFilter = BloomFilter(self.myTrackingHasher)
    for theWord in ['foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6']:
        theFilter.addWord(theWord)
    # The test makes no assertions; it exists to print the tracking data.
    self.myTrackingHasher.printTrackingData()
from BloomFilter import BloomFilter import random import string def removeMatchingWord(aWordToFind, aWords): if aWordToFind in aWords: aWords.remove(aWordToFind) if __name__ == '__main__': theBloomFilter = BloomFilter() [ theBloomFilter.addWord(line.strip()) for line in open('/usr/share/dict/words') ] theWrongWords = [] theCountOfRandomWords = 0 while len(theWrongWords) < 300: theCountOfRandomWords = theCountOfRandomWords + 1 theRandomString = ''.join( random.choice(string.ascii_lowercase) for x in range(5)) if (theBloomFilter.checkWord(theRandomString)): theWrongWords.append(theRandomString) print '# random words checked: ' + str(theCountOfRandomWords) [
from BloomFilter import BloomFilter
import random
import string


def removeMatchingWord(aWordToFind, aWords):
    """Remove every occurrence of aWordToFind from the list aWords, in place.

    list.remove() only drops the first match, so it is repeated until the
    word is gone; otherwise a word present twice would survive one call.
    Does nothing when the word is not in the list.
    """
    while aWordToFind in aWords:
        aWords.remove(aWordToFind)


if __name__ == '__main__':
    # Load every line of the system word list into the filter.
    theBloomFilter = BloomFilter()
    with open('/usr/share/dict/words') as theDictionary:
        for line in theDictionary:
            theBloomFilter.addWord(line.strip())

    # Draw random 5-letter lowercase strings until 300 of them are accepted
    # by the filter.  Accepted strings may be real words or false positives.
    theWrongWords = []
    theCountOfRandomWords = 0
    while len(theWrongWords) < 300:
        theCountOfRandomWords += 1
        theRandomString = ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
        if theBloomFilter.checkWord(theRandomString):
            theWrongWords.append(theRandomString)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('# random words checked: ' + str(theCountOfRandomWords))

    # Discard accepted strings that really are in the word list; whatever
    # remains was accepted by the filter without being a dictionary word.
    with open('/usr/share/dict/words') as theDictionary:
        for line in theDictionary:
            removeMatchingWord(line.strip(), theWrongWords)

    print('# false positives: ' + str(len(theWrongWords)))
    print('false positive rate: ' + str(100.0 * float(len(theWrongWords)) / float(theCountOfRandomWords)))
class TestBloomFilter(unittest.TestCase):
    """Unit tests for BloomFilter, using HasherForTest for most cases and the default hasher for one."""

    def setUp(self):
        # Each test gets a fresh filter wired to a hasher whose results the
        # test sets through myTestResults.
        self.myHasherForTest = HasherForTest()
        self.myBloomFilter = BloomFilter(self.myHasherForTest)

    def testHasBitField(self):
        """The filter exposes a bit array."""
        self.assertIsNotNone(self.myBloomFilter.myBitArray, 'bitfield does not exist')

    def testBitFieldDefaultsToFalse(self):
        """A new filter has no bits set."""
        self.assertFalse(
            self.myBloomFilter.myBitArray.any(),
            'bitfield should be initialized to all Falses',
        )

    def testAddWord(self):
        """Adding words sets exactly the indexes configured for them on the test hasher."""
        self.myHasherForTest.myTestResults['foo'] = [4, 5]
        self.myHasherForTest.myTestResults['bar'] = [2, 7, 5]
        for theWord in ('foo', 'bar'):
            self.myBloomFilter.addWord(theWord)
        self.assertThatOnlyIndexesAreTrue([4, 5, 2, 7])

    def assertThatOnlyIndexesAreTrue(self, anIndexes):
        """Assert that the filter's bit array is True at anIndexes and False everywhere else."""
        # Work on a copy: expected bits are cleared one by one, so any bit
        # still set at the end was not expected.
        theRemainingBits = bitarray(self.myBloomFilter.myBitArray)
        for theExpectedIndex in anIndexes:
            self.assertTrue(
                self.myBloomFilter.myBitArray[theExpectedIndex],
                'index ' + str(theExpectedIndex) + ' is false when it should be true',
            )
            theRemainingBits[theExpectedIndex] = False
        self.assertFalse(
            theRemainingBits.any(),
            'an unexpected index(es) were True, they should be false: ' + str(theRemainingBits),
        )

    def testCheckWord(self):
        """A word that was added is found; a word that was not added is not."""
        self.myHasherForTest.myTestResults['foo'] = [4, 5]
        self.myHasherForTest.myTestResults['bar'] = [2, 7, 5]
        self.myBloomFilter.addWord('foo')
        self.assertTrue(self.myBloomFilter.checkWord('foo'), 'word foo should be in the filter')
        self.assertFalse(self.myBloomFilter.checkWord('bar'), 'word bar should NOT be in the filter')

    def testCheckWordWithRealHasher(self):
        """Exercise add/check with a filter constructed without a test hasher."""
        theFilter = BloomFilter()
        theWordsToAdd = [
            "foo",
            "bar",
            "barf",
            "barge",
            "barn",
            "bart",
            "fnarfle-pants",
            "BLARG",
            "blarg",
            "a",
            "aardvark",
            "platypus",
            "melee",
            "somethingreallylong",
            "carrot",
            "derpa derpa der",
            "b",
            "#winning",
        ]
        for theWord in theWordsToAdd:
            self.validateAddingWord(theFilter, theWord)
        # Words that were never added must still be reported as absent.
        for theAbsentWord in ("bat", "mele", "blah"):
            self.assertFalse(theFilter.checkWord(theAbsentWord))

    def validateAddingWord(self, aBloomFilter, aWordToTest):
        """Assert aWordToTest is absent from aBloomFilter, add it, then assert it is present."""
        self.assertFalse(aBloomFilter.checkWord(aWordToTest))
        aBloomFilter.addWord(aWordToTest)
        self.assertTrue(aBloomFilter.checkWord(aWordToTest))