示例#1
0
class BloomFilterSearch():
    """Bloom-filter experiment that counts false positives over a dictionary.

    The instance owns a ``BloomFilter`` built from ``bit_vec`` and ``hash``
    and keeps a running ``false_positives`` counter.
    """

    def __init__(self, ciphers_dict, bit_vec, hash):
        """Store the parameters and build the underlying filter.

        Args:
            ciphers_dict: passed to ``PEKSServer.Test`` by ``SearchWord``.
            bit_vec: first argument handed to ``BloomFilter``.
            hash: second argument handed to ``BloomFilter``.
        """
        self.ciphers_dict = ciphers_dict
        self.bit_vec = bit_vec
        self.hash = hash
        self.false_positives = 0
        self.filter = BloomFilter(bit_vec, hash)

    def SearchWord(self, dictionary, word):
        """Run a ``PEKSServer`` test for ``word``; the result is discarded."""
        entry = dictionary[word]
        PEKSServer(entry[0]).Test(self.ciphers_dict, dictionary[word][1])

    def insert(self, s1, cipher):
        """Decrypt ``cipher`` and insert the plaintext into the filter.

        The AES key is the first 32 hex characters of the SHA-256 digest of
        ``str(s1[0])``.
        """
        digest = hashlib.sha256(str(s1[0]).encode()).hexdigest()
        plaintext = aes.decrypt(cipher, digest[:32])
        self.filter.insert(str(plaintext))

    def percentage(self, part, whole):
        """Return ``part`` as a percentage of ``whole`` (both coerced to float)."""
        scaled = 100 * float(part)
        return scaled / float(whole)

    def bl_search(self, dict, store):
        """Count filter hits for keys absent from ``store`` and report them.

        Every key of ``dict`` whose ``dict[key][1]`` value is reported present
        by the filter, but which is not in ``store``, increments
        ``self.false_positives``. The resulting percentage (relative to
        ``len(store)``) is appended to the module-level ``percent`` list and a
        summary is written to the module-level ``o`` stream.
        """
        for key in dict:
            hit = self.filter.check(str(dict[key][1]))
            if hit is True and key not in store:
                self.false_positives += 1

        p = self.percentage(self.false_positives, len(store))
        percent.append(p)
        report_lines = (
            'Size of bit vector: ' + str(self.bit_vec) + '\n',
            'Number of hashes used: ' + str(self.hash) + '\n',
            'Number of false positives: ' + str(self.false_positives) + '\n',
            'Percentage of false positives: ' + str(p) + '% \n\n\n',
        )
        for report_line in report_lines:
            o.write(report_line)
示例#2
0
def computeBloomFilter():
    """Build a Bloom filter from the lines of the file named by ``input_path``.

    The file is read twice: once to count its lines (used as the size
    argument of ``BloomFilter``) and once to insert every line together with
    the number of times that exact line has been seen so far.

    Returns:
        The populated ``BloomFilter`` instance.
    """
    filename = input_path

    # First pass: total number of lines, used to size the filter.
    with open(filename) as user_file:
        user_file_NOL = sum(1 for _ in user_file)

    bloom_filter = BloomFilter(user_file_NOL)

    # Running occurrence count of each distinct line.
    user_file_content = {}

    # Second pass: insert each line with its running frequency.
    with open(filename) as user_file:
        for line in user_file:
            # dict.get replaces the former bare ``except:``, which would also
            # have swallowed unrelated errors such as KeyboardInterrupt.
            user_file_content[line] = user_file_content.get(line, 0) + 1
            bloom_filter.insert(line, freq=user_file_content[line])

    return bloom_filter
 def setUp(self):
     """Fill a fresh filter and a control set with the same 400 random ints."""
     self.b_filter = BloomFilter()
     self.control = set()
     for _ in range(400):
         value = randint(0, 1000)
         self.b_filter.add_element(value)
         self.control.add(value)
示例#4
0
    def make_experiment(self):
        """Measure the false-positive percentage over ``self.b`` rounds.

        Each round builds ``BloomFilter(self.m, self.k)``, inserts ``self.n``
        random odd numbers (as strings) and then checks ``self.n`` random even
        numbers; every ``True`` check is counted in
        ``self.falsePositiveCounter``. The counter is not reset here, so it
        accumulates onto whatever value it already holds. Start and finish
        times are stored in ``self.timeStart`` / ``self.timeFinish``.

        Returns:
            ``self.percentage`` of the per-round average count over ``self.n``.
        """
        self.timeStart = time.time()

        for _ in range(0, self.b):
            bf = BloomFilter(self.m, self.k)

            # Only odd values are inserted ...
            for _ in range(0, self.n):
                odd_item = random.randint(0, self.n) * 2 + 1
                bf.insert(str(odd_item))

            # ... so any even value reported present is a false positive.
            for _ in range(0, self.n):
                even_item = random.randint(0, self.n) * 2
                if bf.check(str(even_item)) is True:
                    self.falsePositiveCounter += 1

        self.falsePositiveCounter = self.falsePositiveCounter / self.b
        self.timeFinish = time.time()

        return self.percentage(self.falsePositiveCounter, self.n)
示例#5
0
  def make_experiment(self):
    """Measure the false-positive percentage over ``self.b`` rounds.

    Each round builds ``BloomFilter(self.m, self.k)``, inserts ``self.n``
    random odd numbers (as strings) and then checks ``self.n`` random even
    numbers; every ``True`` check is counted in ``self.falsePositiveCounter``,
    which is not reset here. Start and finish times are stored in
    ``self.timeStart`` / ``self.timeFinish``.

    Returns:
      ``self.percentage`` of the per-round average count over ``self.n``.
    """
    self.timeStart = time.time()

    for _ in range(0, self.b):
      bf = BloomFilter(self.m, self.k)

      # Only odd values are inserted ...
      for _ in range(0, self.n):
        odd_item = random.randint(0, self.n) * 2 + 1
        bf.insert(str(odd_item))

      # ... so any even value reported present is a false positive.
      for _ in range(0, self.n):
        even_item = random.randint(0, self.n) * 2
        if bf.check(str(even_item)) is True:
          self.falsePositiveCounter += 1

    self.falsePositiveCounter = self.falsePositiveCounter / self.b
    self.timeFinish = time.time()

    return self.percentage(self.falsePositiveCounter, self.n)
class BloomSpellChecker(object):
    """Look up spelling variants of a word in a Bloom filter of known words."""

    def __init__(self):
        self.myBloomFilter = BloomFilter()

    def addWord( self, aWord ):
        """Register ``aWord`` in the underlying filter."""
        self.myBloomFilter.addWord(aWord)

    def checkWord( self, aWordToCheck ):
        """Return the first generated variant the filter accepts.

        Returns the literal string ``"no correction found"`` when no variant
        is accepted.
        """
        theCandidates = self.generateWordOptions(aWordToCheck)
        for theCandidate in theCandidates:
            if self.myBloomFilter.checkWord( theCandidate ):
                return theCandidate
        return "no correction found"

    def generateWordOptions( self, aWordToCheck ):
        """Return candidate spellings as a list.

        The list starts with the word as given, lower-cased and capitalized,
        followed by the repeated-character reductions of each of those three
        forms (duplicates are not removed).
        """
        theCaseVariants = [ aWordToCheck, aWordToCheck.lower(), aWordToCheck.capitalize() ]
        theWordOptions = list( theCaseVariants )
        for theVariant in theCaseVariants:
            theWordOptions.extend( self.generateWordOptionsByRemovingRepeatingCharacters( theVariant ) )
        return theWordOptions

    def generateWordOptionsByRemovingRepeatingCharacters(self, aWord):
        """Return the set of words obtained by optionally collapsing repeats.

        Works recursively on the tail of the word: whenever the first letter
        equals the first letter of a tail option, the variant with that
        letter merged is added next to the unmerged one.
        """
        if len(aWord) <= 1:
            return { aWord }

        theHead = aWord[0:1]
        theResult = set()
        for theTailOption in self.generateWordOptionsByRemovingRepeatingCharacters(aWord[1:]):
            if theHead == theTailOption[0]:
                # Same letter twice in a row: also offer the collapsed form.
                theResult.add( theHead + theTailOption[1:] )
            theResult.add( theHead + theTailOption )
        return theResult
示例#7
0
    def testCheckWordWithRealHasher(self):
        """Add a fixed word list to a default filter, then probe absent words."""
        theBloomFilter = BloomFilter()

        theWordsToAdd = (
            "foo",
            "bar",
            "barf",
            "barge",
            "barn",
            "bart",
            "fnarfle-pants",
            "BLARG",
            "blarg",
            "a",
            "aardvark",
            "platypus",
            "melee",
            "somethingreallylong",
            "carrot",
            "derpa derpa der",
            "b",
            "#winning",
        )
        for theWord in theWordsToAdd:
            self.validateAddingWord( theBloomFilter, theWord )

        # None of these were added, so the filter is expected to reject them.
        for theAbsentWord in ( "bat", "mele", "blah" ):
            self.assertFalse( theBloomFilter.checkWord( theAbsentWord ) )
示例#8
0
 def run(self):
     """Crawl every hotel listed in the database, forever.

     Each pass logs in, saves a screenshot, purges expired data, loads the
     hotel names from MySQL and fetches the page of every hotel whose
     ``name + XieChenGSpider.Num`` key is not yet in the Bloom filter. The
     pass ends by incrementing ``XieChenGSpider.Num`` and quitting the driver.
     """
     username = '******'
     password = '******'
     while True:
         self.login_index(username, password)  # log in
         self.driver.save_screenshot('login.png')
         XieChenGSpider.delete_old_day()  # delete expired data
         print(XieChenGSpider.Num)
         select_hotel = """SELECT hotel FROM t_cl_xiecheng_hotel"""
         db = pymysql.connect(host='47.92.162.87',
                              port=3306,
                              user='******',
                              password='******',
                              db='db_bby_xiecheng')
         try:
             cursor = db.cursor()
             cursor.execute(select_hotel)
             result = cursor.fetchall()
         finally:
             # Close the connection even when the query raises.
             db.close()
         for hotel in result:
             hotel_name = hotel[0]
             print(hotel_name)
             bf = BloomFilter()
             dedup_key = hotel_name + str(XieChenGSpider.Num)
             if bf.isContains(dedup_key):  # check whether the key already exists
                 print('exists!')
             else:
                 print('not exists!')
                 bf.insert(dedup_key)
                 hotel_url = self.get_url(hotel_name)
                 self.get_hotel_page(hotel_url)
         XieChenGSpider.Num += 1
         # NOTE(review): the driver is quit inside the infinite loop; this
         # presumably relies on login_index creating a new driver - confirm.
         self.driver.quit()
示例#9
0
def response(flow):
    """Record previously unseen shops from an ele.me channel-page response.

    Always prints the request's User-Agent and a separator line. Only
    responses whose request URL starts with the channel-page URL and contains
    ``costFrom`` are processed: every recommended restaurant is printed, and
    each shop id not yet in the Bloom filter is inserted and appended as a row
    to the CSV file.
    """
    url = 'https://restapi.ele.me/mix/app/channelPage?extras[]=coupon&scene=app:channel'
    print(flow.request.headers['User-Agent'])
    print('#' * 90)

    is_channel_page = flow.request.url.startswith(url) and 'costFrom' in flow.request.url
    if not is_channel_page:
        return

    bf = BloomFilter()
    data = json.loads(flow.response.text)
    items_list = data.get('recommendList').get('items')
    for items in items_list:
        shop_id = items.get('restaurant').get('id')
        shop_name = items.get('restaurant').get('name')
        shop_url = shop_url_path.format(shop_id)
        print(shop_id, shop_name, shop_url)
        if not bf.isContains(shop_id):  # shop id has not been seen before
            print('not exists!')
            bf.insert(shop_id)
            data_list = [
                shop_id, shop_name, '珠江摩尔国际大厦8号楼', '', shop_url, '', '昌平区',
                '北京市', ' ', ''
            ]
            with open(r'新店饿了么.csv', 'a+', encoding='utf-8-sig',
                      newline='') as file:
                csv.writer(file, dialect='excel').writerow(data_list)
        else:
            print('exists!')
        print('*' * 100)
示例#10
0
def main(num_keys, file_dir):
    """Print each line of ``file_dir`` the first time the filter misses it.

    Args:
        num_keys: value converted with ``int`` and passed to ``BloomFilter``.
        file_dir: path (or paths) handed to ``fileinput.input``.

    A stripped line is inserted and printed only when ``bf.query`` returns 0
    for it.
    """
    input_keys = int(num_keys)
    bf = BloomFilter(input_keys)
    for line in fileinput.input(file_dir):
        line = line.strip()
        if bf.query(line) == 0:
            bf.insert(line)
            # Function-call form is valid on Python 3 and prints the same
            # text on Python 2 for a single argument.
            print(line)
示例#11
0
 def test_hash_indexes(self):
     """hash_indexes('#') yields three positions, each inside range(0, m)."""
     k, m = 3, 128
     bloom = BloomFilter(m, k)
     hash_indexes = bloom.hash_indexes('#')
     self.assertEqual(len(hash_indexes), 3)
     valid_positions = range(0, m)
     for position in hash_indexes:
         self.assertIn(position, valid_positions)
示例#12
0
	def __init__(self):
		"""Set up an empty frequency table, counters and the initial filter."""
		# Frequency table; defaults for filter size and hash-function count.
		self.tableau = []
		self.nb_function = 1
		self.taille = 33000
		# The original comment describes a filter of 1200 bits with two hash
		# functions, while the call passes (300, 2).
		# NOTE(review): these values also differ from taille/nb_function
		# above - confirm which ones are intended.
		self.filtre = BloomFilter(300, 2)
		self.faux_positive = 0
		self.res = "resultBloom.csv"
示例#13
0
def calc2():
    """Sum ``reads.BloomFilter(token)`` over tokens of 10000 lines of pg1661.txt.

    Exactly 10000 ``readline`` calls are made; past the end of the file each
    call yields an empty string, which still contributes one empty token.

    Returns:
        The accumulated sum of the values returned by ``reads.BloomFilter``.
    """
    count = 0
    # The context manager closes the file, which was previously left open.
    with open('pg1661.txt', 'r') as fo:
        reads = BloomFilter(320000)
        for _ in range(10000):
            # Distinct loop names: the inner loop used to shadow the outer ``i``.
            for token in fo.readline().split(' '):
                count += reads.BloomFilter(token)
    return count
示例#14
0
    def testFoo(self):
        """Add seven words through the tracking hasher and print its data."""
        theBloomFilter = BloomFilter(self.myTrackingHasher)
        theWords = ('foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6')
        for theWord in theWords:
            theBloomFilter.addWord(theWord)

        self.myTrackingHasher.printTrackingData()
示例#15
0
def makeFilterloadPayload(publicKeyHash, scriptHash, transactionID):
    """Return the serialized Bloom filter holding the three given elements.

    The filter is created as ``BloomFilter(3, 0.01, 0, 1)``; the length of its
    ``bit_array`` is printed before serialization.
    """
    bloom = BloomFilter(3, 0.01, 0, 1)
    for element in (publicKeyHash, scriptHash, transactionID):
        bloom.add(element)
    print(len(bloom.bit_array))
    return bloom.serialize()
示例#16
0
def main():
    """Exercise ``BF``: insert one pattern, look up two, then load a CSV.

    The first lookup uses the inserted pattern and the second one a different
    pattern; both results are printed.
    """
    # An explicitly sized alternative was BF(mbits=15000, nitems=680).
    bf = BF()
    # --- Insertion to bloomfilter pattern ---
    bf.insertToBloomFilterPattern('4.6649981966.7720977692.5995632615.525560175001.063711911256403200000000')
    # Function-call form of print is valid on both Python 2 and Python 3.
    print(bf.lookupFromBloomFilterPattern('4.6649981966.7720977692.5995632615.525560175001.063711911256403200000000'))
    print(bf.lookupFromBloomFilterPattern('4.7684212566.6980465244.51134545.522727217001.006710836100502560.2343192991111110'))
    bf.setInitialElementsOfBloomFilterPattern('../rawfiles/bf_initialcandidate.csv')
示例#17
0
 def create_bloom_filter(self, data):
     """Build the backup filter from positives the model scores too low.

     Every item of ``data.positives`` whose prediction is at or below
     ``self.threshold`` is added to a new ``BloomFilter`` created with half
     of ``self.fp_rate``; the filter is stored in ``self.bloom_filter``.
     """
     print("Creating bloom filter")
     preds = self.model.predicts(data.positives)
     false_negatives = [
         data.positives[idx]
         for idx in range(len(data.positives))
         if preds[idx] <= self.threshold
     ]
     print("Number of false negatives at bloom time", len(false_negatives))
     self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2,
                                     string_digest)
     for missed in false_negatives:
         self.bloom_filter.add(missed)
     print("Created bloom filter")
示例#18
0
class OccurenceBloom:
	"""Count address occurrences, consulting a Bloom filter before searching.

	``tableau`` is a list of ``(adresse, count)`` tuples and ``faux_positive``
	counts the filter hits that turned out not to be in the table.
	"""

	def __init__(self):
		# Frequency table; defaults for filter size and hash-function count.
		self.tableau = []
		self.nb_function = 1
		self.taille = 33000
		# The original comment describes a filter of 1200 bits with two hash
		# functions, while the call passes (300, 2).
		# NOTE(review): these values also differ from taille/nb_function
		# above - confirm which ones are intended.
		self.filtre = BloomFilter(300, 2)
		self.faux_positive = 0
		self.res = "resultBloom.csv"

	def ajouter(self, adresse):
		"""Record one occurrence of ``adresse`` and add it to the filter."""
		if not self.filtre.existe(adresse):
			# The filter has not seen it: append directly at the end.
			self.tableau.append((adresse, 1))
		else:
			# The filter claims a hit: confirm with a plain table search.
			for position, entree in enumerate(self.tableau):
				if entree[0] == adresse:
					self.tableau[position] = (adresse, entree[1] + 1)
					break
			else:
				# False positive detected: insert the new address.
				self.tableau.append((adresse, 1))
				self.faux_positive += 1
		# Update the filter in every case.
		self.filtre.ajouter(adresse)

	def sauvegarder(self):
		"""Pickle this object to ``save.bloom`` and print a status message."""
		with open("save.bloom", "wb") as sauvegarde:
			pickle.dump(self, sauvegarde, pickle.HIGHEST_PROTOCOL)
		print("sauvegarde en cours ....")

	def reset(self):
		"""Clear the table and counter; rebuild the filter from taille/nb_function."""
		self.tableau = []
		self.filtre = BloomFilter(self.taille, self.nb_function)
		self.faux_positive = 0

	def dix_premier(self):
		"""Return up to ten table entries with the highest counts, highest first."""
		classement = list(self.tableau)
		classement.sort(key=lambda entree: -entree[1])
		return classement[:10]

	def changer_taille(self, taille):
		"""Store a new size and replace the filter with an empty one of that size."""
		self.taille = taille
		self.filtre = BloomFilter(taille, self.nb_function)

	def changer_nb_fct(self, nb_function):
		"""Store a new hash-function count and replace the filter accordingly."""
		self.nb_function = nb_function
		self.filtre = BloomFilter(self.taille, nb_function)
class TestBloomFilter(TestCase):
    """Checks that the filter never rejects an element that was added."""

    def setUp(self):
        """Fill a fresh filter and a control set with the same 400 random ints."""
        self.b_filter = BloomFilter()
        self.control = set()
        for _ in range(400):
            value = randint(0, 1000)
            self.b_filter.add_element(value)
            self.control.add(value)

    def test_check_element(self):
        """Any element the filter rejects must be absent from the control set."""
        for _ in range(100000):
            candidate = randint(0, 1000)
            if self.b_filter.check_element(candidate):
                continue
            self.assertTrue(candidate not in self.control)
示例#20
0
class LocalBloomFilter():
    """Bloom filter wrapper that keeps its bitmap in a local ``bytes`` object."""

    def __init__(self, capacity, error, prime_length=True):
        """Create the underlying filter and a bitmap of ``bits / 8 + 1`` bytes."""
        self.bf = BloomFilter(capacity, error, prime_length)
        byte_count = int(self.bf.bits / 8) + 1
        self.bitmap = bytes(byte_count)

    def add(self, data):
        """Add one string, or every string of a list/tuple, to the filter.

        Raises:
            AssertionError: if a value is not a ``str``. For a sequence,
                items before the offending one have already been added.
        """
        message = 'add() arg must be a str or list/tuple of strings'
        if not isinstance(data, (list, tuple)):
            assert isinstance(data, str), message
            self.bf.add(self.bitmap, data)
            return
        for item in data:
            assert isinstance(item, str), message
            self.bf.add(self.bitmap, item)

    def is_contain(self, data):
        """Query one string, or each string of a list/tuple.

        Returns:
            The filter's answer for a single string, or a list of answers in
            input order for a list/tuple.

        Raises:
            AssertionError: if a value is not a ``str``; a sequence is fully
                validated before any query is made.
        """
        message = 'is_contain() arg must be a str or list/tuple of strings'
        if not isinstance(data, (list, tuple)):
            assert isinstance(data, str), message
            return self.bf.is_contain(self.bitmap, data)
        for item in data:
            assert isinstance(item, str), message
        return [self.bf.is_contain(self.bitmap, item) for item in data]

    def clean(self):
        """Ask the underlying filter to clean the bitmap."""
        self.bf.clean_bitmap(self.bitmap)
    def testFoo(self):
        """Add seven words through the tracking hasher and print its data."""
        theBloomFilter = BloomFilter( self.myTrackingHasher )
        theWords = ( 'foo', 'foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6' )
        for theWord in theWords:
            theBloomFilter.addWord( theWord )

        self.myTrackingHasher.printTrackingData()
 def test_working(self):
     """Added words are found; a word that was never added is not."""
     bloom_filter = BloomFilter(25)
     # Each word is checked right after it is added.
     for word in ("these", "fdfdf"):
         bloom_filter.add_word(word)
         self.assertTrue(bloom_filter.check_word(word))
     self.assertFalse(bloom_filter.check_word("plaban"))
示例#23
0
def getMissingContent(n, bloomfilter_bytes):
    """Return the lines of ``input_path`` that the received filter rejects.

    Args:
        n: size argument used to construct the ``BloomFilter``.
        bloomfilter_bytes: serialized filter loaded with
            ``readBloomFilterFromBytes``.

    Returns:
        A dict mapping 1-based line numbers to the lines for which
        ``validate(line, freq=<occurrences so far>)`` is falsy.
    """
    receivedBF = BloomFilter(n)
    receivedBF.readBloomFilterFromBytes(bloomfilter_bytes)

    missing_content = {}
    # Running occurrence count of each distinct line.
    user_file_content = {}
    with open(input_path) as user_file:
        for line_number, line in enumerate(user_file, start=1):
            # dict.get replaces the former bare ``except:``, which would also
            # have swallowed unrelated errors such as KeyboardInterrupt.
            user_file_content[line] = user_file_content.get(line, 0) + 1
            if not receivedBF.validate(line, freq=user_file_content[line]):
                missing_content[line_number] = line
    return missing_content
示例#24
0
def test_real_fp_prob(filename,
                      fp_prob=0.01,
                      hash_cnt=3,
                      iteration=30,
                      test_size=10000):
    """
    Print measured false positive rates for a filter built from a word file.

    Words are read from ``filename`` one per line, stopping at the first
    empty line. The first half of the words is passed to
    ``evaluate_error_rate`` as present words and the second half as absent
    words; the measured rate is printed ``iteration`` times.

    Raises ValueError when ``test_size`` is not positive or fewer than two
    words were read.
    """
    word_list = []
    # ISO-8859-1 is used so unusual characters in the file can be read.
    with open(filename, "r", encoding="ISO-8859-1") as int_file:
        for raw_line in int_file:
            line = raw_line.replace('\n', '')
            if not line:
                break
            word_list.append(line)

    separator = f'==============================================================================================================='
    print(separator)
    print(
        f'test false positive probability by given desgired false positive probability and given k hash function constraints'
    )
    print(separator)

    if test_size <= 0:
        raise ValueError("test_size must bigger than zero")
    if len(word_list) <= 1:
        raise ValueError("word_list_size must be at least 2")

    # First half is inserted, second half is only used for probing.
    word_present_cnt = len(word_list) // 2
    word_present = word_list[:word_present_cnt]
    word_absent = word_list[word_present_cnt:]

    bloom_filter = BloomFilter(word_present_cnt, fp_prob, hash_cnt)
    print(f'word_list size: {len(word_list)}')
    print(f'iteration: {iteration}')
    print(f'input test size: {test_size}')

    # The test size can never exceed the number of available words.
    test_size = min(test_size, len(word_list))
    print(f'adjusted test size: {test_size}')

    for _ in range(iteration):
        print(f'test false positive rate: \
            {format(evaluate_error_rate(word_present, word_absent, bloom_filter, min(test_size, len(word_list))), "f")}'
              )
示例#25
0
def read_write_test(word, items_count, fp_rpob, hash_cnt):
    """
    Add a single word to a new filter and print the result of looking it up.

    The filter is built as ``BloomFilter(items_count, fp_rpob, hash_cnt)``,
    e.g. items_count = 1000, fp_rpob = 0.01, hash_cnt = 3.
    """
    separator = f'==============================================================================================================='
    print(separator)
    print(f'single write and read test')
    print(separator)

    bloom_filter = BloomFilter(items_count, fp_rpob, hash_cnt)
    bloom_filter.add(word)
    print(f'add data: {word}')
    lookup_result = bloom_filter.may_match(word)
    print(f'look up data: {word} = {lookup_result}')
示例#26
0
def evaluate_fp_rate(items_count, fp_list, hash_cnt_list):
    """
    Print the bit-array size required for each (fp_prob, hash_cnt) pair.

    For every combination the size comes from
    ``BloomFilter.get_size_by_hash_count_and_fp_prob``; the size/items ratio
    and the size converted to megabytes are printed next to it.
    """
    separator = f'==============================================================================================================='
    print(separator)
    print(
        f'evaluate false positive probailitiy by given desgired false positive probability and given k hash function constraints'
    )
    print(separator)

    for fp_prob in fp_list:
        for hash_cnt in hash_cnt_list:
            size = BloomFilter.get_size_by_hash_count_and_fp_prob(
                items_count, hash_cnt, fp_prob)
            ratio_text = format(size / items_count, ".2e")
            # bits -> MB: divide by 2**20, then by 8 bits per byte
            space_text = format(size / math.pow(2, 20) / 8, "f")

            print(f'false_positive:{fp_prob} \
                array_size: {size} \
                hash_count: {hash_cnt} \
                ratio: {ratio_text} \
                space(MB): {space_text}')
示例#27
0
 def __init__(self):
     """Create the SQL helper, the Bloom filter and the fixed request settings."""
     self.my = OperateSql()
     self.bf = BloomFilter()
     # URL templates; each ``{}`` placeholder is filled in elsewhere.
     self.license_url = 'https://h5.ele.me/restapi/shopping/v1/restaurants/{}/business/qualification'
     self.license_path = 'https://cube.elemecdn.com/{}.jpeg'
     # Fixed HTTP headers for requests.
     # NOTE(review): the cookie below is a hard-coded session captured on
     # 2019-10-31 - presumably it expires and should come from configuration.
     self.headers = {
         "accept":
         "application/json, text/plain, */*",
         "accept-encoding":
         "gzip, deflate, br",
         "accept-language":
         "zh-CN,zh;q=0.9,en;q=0.8",
         "cookie":
         "__wpkreporterwid_=5c1b9625-e0e5-4191-94ec-617c63d85828; ubt_ssid=orikwvv98rf349gzuztl6qswlsehx95b_2019-10-31; perf_ssid=39tu4b3xkbrpyw3gwzyrroi0a37e5ltj_2019-10-31; ut_ubt_ssid=6puu2k5t5iqqze593aa84gqsb402sks6_2019-10-31; _bl_uid=IgkgO2v0e0k6LwqmCrz7e2O8wRUz; cna=gY2qExKVBQECAd9YCuTqIvMq; _utrace=70d38df1be30fbbdbbc6a5282dd6ce95_2019-10-31; track_id=1572494757|16968ea868ee345e15bdb0c6f4d78b23e2e3c4d4569c374fae|bb82e1bf879b4737f8c5f86195210994; USERID=10475002; tzyy=0a37b285480279db053bbd4cbb8d8310; UTUSER=10475002; SID=AjfMV8XFn8xrZJTbt0C8i8m1oqB2DbruQxoQ; ZDS=1.0|1572503692|RGAZN+cBWA5CW8xwYLcUH12DMCx9UoNnWZPQMDqhx1gO8fOuCaRY/HxKf+BFRrzP30TX/MIJZjpOiqnkcPPGYw==; l=dBLedHhnqFN5qTL5BOfgmuI8Si_tyIRfGsPzw4GXtICPOdCePBpVWZQZ7WYwCnGVnswvR37NvzWDBV8nkyzHQgfYduwXXEigzd8h.; pizza-rc-ca-result=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhbGciOiJIUzI1NiIsImV4cCI6MTU3MjU4MDI1NCwic2FsdDEiOiJiMGI3MzQzZWU5MTc4YzI1ODgxMWQwZDczMTg1NDU4YSIsInNhbHQyIjoiNjk5MzhlNjkwMmZkOTA5YTFjNjFhOTdlZGZiZmZiNzkifQ.dzzgRy_scGB0QBifCrNavjsDkSyX4a2a-ubs9nkX0Fo; pizza73686f7070696e67=_HHDoSEnvf2II7jNDsoNPY2O8tPmnHsxgglXdGQnv3Q27pXDbCCoc4uL5jGRs6EA; isg=BFZW_vooF5q4QyOfTzbsbr0gpwyYX_89LP606MC_QjnUg_YdKIfqQbx1H1nKK5JJ",
         "user-agent":
         "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36"
     }
示例#28
0
class DeepBloom(object):
    """Model-backed membership test with a Bloom filter for the model's misses.

    ``check`` accepts an item when the model's prediction exceeds the learned
    threshold, and otherwise falls back to a Bloom filter that holds the
    positives the model scored at or below that threshold.
    """

    def __init__(self, model, data, fp_rate):
        """Fit the model on ``data`` and build the backup Bloom filter.

        Args:
            model: object providing ``fit``, ``predict`` and ``predicts``.
            data: dataset exposing ``positives``; negatives are obtained
                through ``split_negatives``.
            fp_rate: target false positive rate, converted to ``float``.
        """
        self.model = model
        self.threshold = None
        self.fp_rate = float(fp_rate)
        self.fit(data)
        self.create_bloom_filter(data)

    def check(self, item):
        """Return True if the model or the backup filter accepts ``item``."""
        if self.model.predict(item) > self.threshold:
            return True
        return self.bloom_filter.check(item)

    def create_bloom_filter(self, data):
        """Store in the backup filter every positive the model rejects."""
        print("Creating bloom filter")
        false_negatives = []
        preds = self.model.predicts(data.positives)
        for i in range(len(data.positives)):
            if preds[i] <= self.threshold:
                false_negatives.append(data.positives[i])
        print("Number of false negatives at bloom time", len(false_negatives))
        # Half of the false positive budget goes to the backup filter.
        self.bloom_filter = BloomFilter(len(false_negatives), self.fp_rate / 2,
                                        string_digest)
        for fn in false_negatives:
            self.bloom_filter.add(fn)
        print("Created bloom filter")

    @staticmethod
    def _threshold_index(count, fp_rate):
        """Return the index of the threshold among ``count`` sorted predictions.

        The raw value ``ceil(count * (1 - fp_rate / 2))`` equals ``count`` for
        small ``fp_rate`` and would index one past the end, so it is clamped
        to the last valid position (the largest prediction).
        """
        raw_index = int(math.ceil(count * (1 - fp_rate / 2)))
        return min(raw_index, count - 1)

    def fit(self, data):
        """Train the model and derive ``self.threshold`` from held-out negatives.

        Raises:
            IndexError: if the held-out negative split is empty.
        """
        ## Split negative data into subgroups.
        (s1, s2) = split_negatives(data)
        print("Training model with train, dev, positives", len(s1), len(s2),
              len(data.positives))

        ## Shuffle together subset of negatives and positives.
        ## Then, train the model on this data.
        shuffled = shuffle_for_training(s1, data.positives)
        self.model.fit(shuffled[0], shuffled[1])
        print("Done fitting")

        ## We want a threshold such that at most s2.size * fp_rate/2 elements
        ## are greater than threshold.
        fp_index = self._threshold_index(len(s2), self.fp_rate)
        predictions = self.model.predicts(s2)
        predictions.sort()
        self.threshold = predictions[fp_index]
示例#29
0
 def test_bloom_creation(self):
     """A new filter exposes k, size, an all-zero bit array and k hash functions."""
     k, m = 3, 128
     expected_bits = bitarray(m)
     expected_bits.setall(0)
     bloom = BloomFilter(m, k)
     self.assertEqual(bloom.k, k)
     self.assertEqual(bloom.size, m)
     self.assertEqual(bloom.bits, expected_bits)
     self.assertEqual(len(bloom.hashFunctions), k)
示例#30
0
 def test_contain(self):
     """'#' is reported absent before it is added and present afterwards."""
     k, m = 3, 128
     bloom = BloomFilter(m, k)
     self.assertFalse(bloom.test('#'))
     bloom.add("#")
     self.assertTrue(bloom.test('#'))
示例#31
0
def test_bloom_filter():
    """Insert a word list, then print a verdict for a shuffled test sample.

    The sample holds the first ten inserted words plus two words that were
    never inserted; a membership hit on one of the latter is reported as a
    false positive.
    """
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = ['abound', 'abounds', 'abundance', 'abundant', 'accessable',
                    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
                    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
                    'gems', 'generosity', 'generous', 'generously', 'genial']
    word_absent = ['facebook', 'twitter']

    for inserted in word_present:
        bloomfilter.add(inserted)

    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if not bloomfilter.is_member(word):
            print(f"'{word}' is definitely not present!")
        elif word in word_absent:
            print(f"'{word}' is a false positive!")
        else:
            print(f"'{word}' is probably present!")
示例#32
0
def main():
    """Command-line entry point with ``build`` and ``query`` subcommands.

    ``build -k KEYFILE -f FPR -n N -o OUT`` creates ``BloomFilter(f, n)``,
    inserts the key file and dumps the filter to ``OUT``.
    ``query -i FILTER -q QUERIES`` loads a dumped filter and runs
    ``query_file`` on the query file.

    Exits with an argparse usage error when no subcommand is given, and with
    a message when the filter file for ``query`` does not exist.
    """
    parser = ag.ArgumentParser()
    parser_build = ag.ArgumentParser(add_help=False)
    parser_query = ag.ArgumentParser(add_help=False)

    parser_build.add_argument('-k',
                              type=str,
                              help="Key File",
                              required=True,
                              dest='k')
    parser_build.add_argument('-f',
                              type=float,
                              help="FPR",
                              required=True,
                              dest='f')
    parser_build.add_argument('-n',
                              type=int,
                              help="Number of distinct keys",
                              required=True)
    parser_build.add_argument('-o',
                              type=str,
                              help="Output file to store input",
                              required=True)

    parser_query.add_argument('-i',
                              type=str,
                              help="Input file containing bloomFilter array",
                              required=True)
    parser_query.add_argument('-q',
                              type=str,
                              help="Input file containing queries",
                              required=True)

    subparsers = parser.add_subparsers()
    subparser_build = subparsers.add_parser("build", parents=[parser_build])
    subparser_build.set_defaults(which="build")
    subparser_query = subparsers.add_parser("query", parents=[parser_query])
    subparser_query.set_defaults(which="query")

    args = parser.parse_args()

    # Subcommands are optional by default on Python 3, so ``which`` is missing
    # when none was given; report a usage error instead of an AttributeError.
    command = getattr(args, 'which', None)
    if command is None:
        parser.error("a subcommand is required: build or query")

    if command == 'build':
        BF = BloomFilter(args.f, args.n)
        BF.insert(args.k)
        with open(args.o, "wb") as f:
            Pi.dump(BF, f)

    elif command == 'query':
        if os.path.exists(args.i):
            with open(args.i, "rb") as f:
                # NOTE(review): if Pi is pickle, loading executes code stored
                # in the file - only use filter files from a trusted source.
                BF = Pi.load(f)
                BF.query_file(args.q)
        else:
            sys.exit("Input file does not exists")
示例#33
0
 def create_bloom_filter(self, data):
     """Build the backup filter from positives that every model rejects.

     A positive is kept when none of the ``self.k`` models scores it above
     its own threshold. The filter is created with ``self.fp_rate_bloom`` and
     stored in ``self.bloom_filter``.
     """
     print("Creating bloom filter")
     preds = [
         self.models[i].predicts(data.positives) for i in range(self.k)
     ]
     false_negatives = []
     for j in range(len(data.positives)):
         # The list (not a generator) keeps every comparison evaluated.
         accepted_by_any = any(
             [preds[i][j] > self.thresholds[i] for i in range(self.k)])
         if not accepted_by_any:
             false_negatives.append(data.positives[j])
     print("Number of false negatives at bloom time", len(false_negatives))
     print("Effective bloom filter false positive rate", self.fp_rate_bloom)
     self.bloom_filter = BloomFilter(
         len(false_negatives),
         self.fp_rate_bloom,
         string_digest
     )
     for missed in false_negatives:
         self.bloom_filter.add(missed)
     print("Created bloom filter")
示例#34
0
def url_test(positives, negatives, fp_rate):
    """Fill a filter with ``positives`` and print its rate on ``negatives``.

    Every positive is asserted to be found right after insertion. The filter's
    size and hash count are printed, followed by the fraction of negatives the
    filter reports as present.
    """
    bf = BloomFilter(len(positives), fp_rate, string_digest)
    for sample in positives:
        bf.add(sample)
        assert bf.check(sample)
    print("Bits needed", bf.size)
    print("Hash functions needed", bf.hash_count)

    # Float start value keeps the count, and therefore the ratio, a float.
    false_hits = sum((1.0 for sample in negatives if bf.check(sample)), 0.0)
    print("False positives", false_hits / len(negatives))
示例#35
0
	def __init__(self, name,path,switch=False,solr=None):
		"""Prepare the log directory, queues, filter and helpers for ``name``.

		Args:
			name: identifier used for the log sub-directory, router, logger
				and the two ``deal`` helpers.
			path: base directory; logs go to ``<path>\\logs\\<name>``.
			switch: stored as ``self.switch``.
			solr: passed through to the ``deal`` helpers.
		"""
		self.name=name
		self.switch=switch
		# Backslashes are escaped explicitly: '\l' is an invalid escape
		# sequence. The resulting path strings are unchanged.
		logs_dir=path+'\\logs'
		self.path=logs_dir+'\\'+self.name
		if not os.path.exists(logs_dir):
			os.mkdir(logs_dir)
		if not os.path.exists(self.path):
			os.mkdir(self.path)
		self.routor=Routor(name,path)
		self.queue=Queue.Queue(maxsize=0)
		self.failQueue=Queue.Queue(maxsize=0)  # failure queue
		self.logger=self.newLogging(name)
		self.bloomfilter=BloomFilter()
		self.count=0
		# Seed the work queue with the pattern of the first route entry.
		self.queue.put(self.routor.route[0]['pattern'])
		self.sleeptime=self.routor.route[-1]['sleeptime']
		self.block=SleepTime(self.sleeptime)  # blocking module
		self.fail=0
		self.job=deal(name,'job',path,solr)
		self.company=deal(name,'company',path,solr)
示例#36
0
文件: unit.py 项目: docete/tomoko
class BloomFilterTestCase(unittest.TestCase):
    """Lookup tests against a filter loaded with the system word list."""

    def setUp(self):
        """Load every line of the american-english dictionary into the filter."""
        self.bf = BloomFilter(262144, 14)
        # The context manager closes the file, which was previously left open.
        with open("/usr/share/dict/american-english") as words_file:
            lines = words_file.read().splitlines()
        for line in lines:
            self.bf.update(line)

    def tearDown(self):
        pass

    def test_probably(self):
        """These words are expected to be reported as "Probably"."""
        self.assertEqual(self.bf.lookup("Max"), "Probably")
        self.assertEqual(self.bf.lookup("mice"), "Probably")

    def test_nope(self):
        """These words are expected to be reported as "Nope"."""
        self.assertEqual(self.bf.lookup("3"), "Nope")
        self.assertEqual(self.bf.lookup("google"), "Nope")
示例#37
0
from BloomFilter import BloomFilter, TrackingHasher

if __name__ == '__main__':
    # Interactive lookup: load the system word list into a filter backed by a
    # tracking hasher, print the hasher's data, then answer queries until the
    # user enters 'q!'.
    theHasher = TrackingHasher()
    theBloomFilter = BloomFilter( theHasher )

    # A plain loop inside ``with`` replaces the side-effect list comprehension
    # and closes the word file once it has been read.
    with open('/usr/share/dict/words') as theWordFile:
        for line in theWordFile:
            theBloomFilter.addWord( line.strip() )

    theHasher.printTrackingData()

    while True:
        # NOTE(review): raw_input exists only on Python 2; it is kept as-is.
        theInput = raw_input( 'Enter a word: ' )

        if theInput == 'q!':
            break

        # Function-call form of print is valid on both Python 2 and Python 3.
        if theBloomFilter.checkWord( theInput ):
            print(theInput + ' was found!')
        else:
            print(theInput + ' was not found...')
            theBloomFilter.printDebuggingData( theInput )
class TestBloomFilter(unittest.TestCase):
    """Unit tests for BloomFilter driven by a scripted hasher (HasherForTest)."""

    def setUp( self ):
        """Create a scripted hasher and a filter that uses it."""
        theHasher = HasherForTest()
        self.myHasherForTest = theHasher
        self.myBloomFilter = BloomFilter( theHasher )

    def testHasBitField(self):
        """The filter exposes a bit array."""
        theBitArray = self.myBloomFilter.myBitArray
        self.assertIsNotNone( theBitArray, 'bitfield does not exist' )

    def testBitFieldDefaultsToFalse(self):
        """A freshly created filter has no bit set."""
        theAnyBitSet = self.myBloomFilter.myBitArray.any()
        self.assertFalse( theAnyBitSet, 'bitfield should be initialized to all Falses' )

    def testAddWord(self):
        """Adding words sets exactly the indexes the hasher returns for them."""
        theScriptedResults = self.myHasherForTest.myTestResults
        theScriptedResults[ 'foo' ] = [ 4, 5 ]
        theScriptedResults[ 'bar' ] = [ 2, 7, 5 ]

        for theWord in ( 'foo', 'bar' ):
            self.myBloomFilter.addWord( theWord )

        self.assertThatOnlyIndexesAreTrue( [ 4, 5, 2, 7 ] )

    def assertThatOnlyIndexesAreTrue(self, anIndexes):
        """Assert the given indexes are set in the filter and no others are."""
        theFilterBits = self.myBloomFilter.myBitArray
        # Work on a copy: every expected index is cleared in it, so any bit
        # still set at the end was not expected.
        theLeftoverBits = bitarray( theFilterBits )

        for theIndex in anIndexes:
            theMessage = 'index ' + str( theIndex ) + ' is false when it should be true'
            self.assertTrue( theFilterBits[theIndex], theMessage )
            theLeftoverBits[theIndex] = False

        theMessage = 'an unexpected index(es) were True, they should be false: ' + str( theLeftoverBits )
        self.assertFalse( theLeftoverBits.any(), theMessage )

    def testCheckWord(self):
        """Only the word that was added is reported as present."""
        theScriptedResults = self.myHasherForTest.myTestResults
        theScriptedResults[ 'foo' ] = [ 4, 5 ]
        theScriptedResults[ 'bar' ] = [ 2, 7, 5 ]
        self.myBloomFilter.addWord( 'foo' )

        theFooFound = self.myBloomFilter.checkWord( 'foo' )
        self.assertTrue( theFooFound, 'word foo should be in the filter' )
        theBarFound = self.myBloomFilter.checkWord( 'bar' )
        self.assertFalse( theBarFound, 'word bar should NOT be in the filter' )

    def testCheckWordWithRealHasher(self):
        """With the default hasher, added words are found and three others are not."""
        theBloomFilter = BloomFilter()

        theWordsToAdd = (
            "foo", "bar", "barf", "barge", "barn", "bart",
            "fnarfle-pants", "BLARG", "blarg", "a", "aardvark",
            "platypus", "melee", "somethingreallylong", "carrot",
            "derpa derpa der", "b", "#winning",
        )
        for theWord in theWordsToAdd:
            self.validateAddingWord( theBloomFilter, theWord )

        for theAbsentWord in ( "bat", "mele", "blah" ):
            self.assertFalse( theBloomFilter.checkWord( theAbsentWord ) )

    def validateAddingWord(self, aBloomFilter, aWordToTest):
        """Assert the word is absent, add it, then assert it is present."""
        theFoundBefore = aBloomFilter.checkWord( aWordToTest )
        self.assertFalse( theFoundBefore )
        aBloomFilter.addWord( aWordToTest )
        theFoundAfter = aBloomFilter.checkWord( aWordToTest )
        self.assertTrue( theFoundAfter )
 def setUp( self ):
     """Create a HasherForTest and a BloomFilter that is built from it."""
     theHasher = HasherForTest()
     self.myHasherForTest = theHasher
     self.myBloomFilter = BloomFilter( theHasher )
示例#40
0
文件: unit.py 项目: docete/tomoko
 def setUp(self):
     """Build a BloomFilter(262144, 14) loaded with every dictionary line."""
     self.bf = BloomFilter(262144, 14)
     # `with` closes the dictionary file deterministically; the previous
     # open(...).read() left the handle to the garbage collector.
     with open("/usr/share/dict/american-english") as word_file:
         lines = word_file.read().splitlines()
     for line in lines:
         self.bf.update(line)
示例#41
0
文件: Start.py 项目: hrcp/bloomfilter
from BloomFilter import BloomFilter
import sys
import time
#import resource

# Command line: <input file> <output file> <float passed to initialize_using_np>
if len(sys.argv)!=4:
    print("Invalid input arguments! The number of them is not right!")
    exit(0)

bloom=BloomFilter()
try:
    input_file=open(sys.argv[1],"r")
    output_file=open(sys.argv[2],"a+")
    # Parsed once here so a malformed value is reported before any work is done.
    p=float(sys.argv[3])

except IOError:
    print("Invalid input arguments! Check that the input file exists!")
    exit(0)
except ValueError:
    print("Third argument not a float number!")
    exit(0)
lines=input_file.readlines()
# Reuse the validated value instead of parsing sys.argv[3] a second time.
# NOTE(review): presumably (expected item count, false-positive probability) -- confirm against BloomFilter.
bloom.initialize_using_np(len(lines),p)

start_time=time.time()
for line in lines:
    # strip() already removes trailing whitespace; the extra rstrip() was a no-op.
    line=line.strip()
    bloom.add(line)

input_file.close()
from BloomFilter import BloomFilter

# Smoke check: two inserted words must be reported present, an unseen one absent.
bf = BloomFilter()

for word in ("python", "vk"):
    bf.add(word)

# (word, expected result of bf.check) pairs, checked in the original order.
for word, expected in (("python", True), ("vk", True), ("kaboom", False)):
    assert(bf.check(word) == expected)
 def __init__(self):
     """Create the BloomFilter (default constructor arguments) held by this object."""
     self.myBloomFilter = BloomFilter()
示例#44
0
class Strategy(object):
	"""爬取策略"""
	def __init__(self, name,path,switch=False,solr=None):
		self.name=name
		self.switch=switch
		self.path=path+'\logs\\'+self.name
		if not os.path.exists(path+'\logs'):
			os.mkdir(path+'\logs')
		if not os.path.exists(self.path):
			os.mkdir(self.path)
		self.routor=Routor(name,path)
		self.queue=Queue.Queue(maxsize=0)
		self.failQueue=Queue.Queue(maxsize=0)#失败队列
		self.logger=self.newLogging(name)
		self.bloomfilter=BloomFilter()
		self.count=0
		self.queue.put(self.routor.route[0]['pattern'])
		self.sleeptime=self.routor.route[-1]['sleeptime']
		self.block=SleepTime(self.sleeptime)#屏蔽模块
		self.fail=0
		self.job=deal(name,'job',path,solr)
		self.company=deal(name,'company',path,solr)

	def newLogging(self,name):
		logger = logging.getLogger(name)
		logger.setLevel(logging.DEBUG)
		# 创建一个handler,用于写入日志文件
		fh = logging.FileHandler(self.path+'\\'+name+'.log')
		fh.setLevel(logging.DEBUG)
		# 再创建一个handler,用于输出到控制台
		ch = logging.StreamHandler()
		ch.setLevel(logging.DEBUG)
		# 定义handler的输出格式
		formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
		fh.setFormatter(formatter)
		ch.setFormatter(formatter)
		# 给logger添加handler
		logger.addHandler(fh)
		logger.addHandler(ch)
		return logger
	"""功能函数包装"""
	def link_and_check(func):
		def _wrapper(*args,**kw):
			url=unicode(args[1])
			headers ={'Accept':'text/html;q=0.9,*/*;q=0.8','Accept-Charset':'ISO-8859-1,utf-8,gb2312;q=0.7,*;q=0.3','Accept-Encoding':'gzip','Connection':'close','Referer':None}
			headers['User-Agent']=getHeader()
			#注意如果依然不能抓取的话,Referer可以设置抓取网站的host
			try:
				try:
					req=requests.get(url,timeout=5,headers=headers)
				except Exception as e:
					raise FailException(args[0],'bad requests:'+str(type(e))[8:])
				if req:
					if req.content:
						kw['content']=req.content
						return func(*args,**kw)
				else:
					args[0].logger.warning('No Content in URL: %s'%url)
					raise FailException(args[0],'No Content in URL')
			except FailException as e:
				args[0].logger.warning('URL: %s | info: %s'%(url,e.info))
				args[0].logger.warning('fail: %s | Stime: %s'%(args[0].fail,args[0].sleeptime))
				if args[0].switch:
					print 'put in failQueue'
					args[0].failQueue.put(url)
			finally:
				if args[0].switch:
					tim=args[0].block.isBlocked(args[0].fail)
					if isinstance(tim,tuple):
						if tim[1]:
							#学习停止
							args[0].switch=False
							args[0].routor.setST(tim[0])
							tim=tim[0]
					args[0].sleeptime=tim
					print args[0].sleeptime
					if args[0].fail==0 or not args[0].switch:
						if not args[0].failQueue.empty():
							for x in range(args[0].failQueue.qsize()):
								u=args[0].failQueue.get()
								args[0].queue.put(u)
		return _wrapper
	"""功能函数"""
	@link_and_check
	def enter(self,url,**kw):
		#处理需要进入并且获取网页指定区域子连接的URL
		text=kw['content']
		area=self.getArea(text,kw['loc'])
		linklist=self.getAllAch(area)
		for link in linklist:
			if not self.bloomfilter.isContain(link):
				self.queue.put(link)
				self.bloomfilter.insert(link)
		self.fail=0

	@link_and_check
	def need(self,url,**kw):
		#处理目标页面的文本信息,直接下载到本地
		text=kw['content']
		if kw['ctg']=='job':
			filename='\job_save.log'
			self.job.txt=text
			self.job.url=url
			forsave= self.job.send('update')[0]
		else:
			filename='\company_save.log'
			self.company.txt=text
			self.company.url=url
			forsave= self.company.send('update')[0]
		with open(self.path+filename,'a') as f:
			f.write(forsave)

		self.fail=0


	def auto(self,url,**kw):
		#处理需要调用URLgenerator的URL
		self.logger.warning('BEGIN USING ATUO generator!')
		self.routor.match(url,submodel=True)
		if len(kw['replace'])==2:
			replace=[str(n) for n in xrange(kw['replace'][0],kw['replace'][1])]
		else:
			replace=kw['replace']

		for x in replace:
			u=urlGenerator(url,kw['between'],x)
			if not self.bloomfilter.isContain(u):
				self.distributor(u)
				self.bloomfilter.insert(u)
		self.routor.match(url,submodel=False)
				
	"""策略核心"""
	def core(self):
		ti=time.time()
		isFinish=False #退出判定
		t=time.time()-ti #计时退出

		# try:
		# 	while not isFinish:
		# 	# size=self.queue.qsize()
		# 	# self.logger.info('before get url, Queue size = %s'%size)
		# 		url=self.queue.get()
		# 		self.distributor(url)
		# 		t=time.time()-ti
		# 		if t>3600:
		# 			isFinish=True
		# 			print 'COUNTE = ',self.count
		# 			self.logger.info('COUNT = %s'%self.count)
		# except:
		# 	print 'FINISH ! In Time:',t
		# 	print self.queue.qsize()
		# 	self.logger.info('FINISH ! In Time: %s'%t)

		while not isFinish:
			url=self.queue.get()
			self.distributor(url)

			#退出机制,测试用
			# t=time.time()-ti
			# if t>3600:
			# 	isFinish=True
			# 	print 'COUNTE = ',self.count
			# 	self.logger.info('COUNT = %s'%self.count)

		print 'FINISH ! In Time:',t
		print self.queue.qsize()
		self.logger.info('FINISH ! In Time: %s'%t)

	def distributor(self,url):
		#分发链接
		afterRoute=self.routor.match(url)
		if afterRoute:
			self.count+=1
			self.logger.info('%s: %s'%(afterRoute['model'],url))
			if afterRoute['model']=='enter':
				self.enter(url,**afterRoute['args'])
			elif afterRoute['model']=='need':
				self.need(url,**afterRoute['args'])
			elif afterRoute['model']=='auto':
				self.auto(url,**afterRoute['args'])
			# time.sleep(self.sleeptime)
		else:
			self.logger.warning('URL: %s is not found in Pattern !'%url)

	"""工具方法"""
	def getArea(self,text,loc):
		#获取指定文本之间的文本
		for k,v in loc.iteritems():
			l=[k,v]
			t=getContent(text,l)
			if t:
				return t
		print text
		raise FailException(self,'No Area is Done')

	def getAllAch(self,text):
		#获取指定文本中的链接,并查重,返回list
		soup=BeautifulSoup(text)
		linklist=[link.get('href') for link in soup.find_all('a')]
		if len(linklist)==0:
			raise FailException(self,'No link in content')
		legallink=[]
		for link in linklist:
			link=str(link)
			if re.match(r'http://.*',link):
				legallink.append(link)
		linklist=legallink
		for script in soup.find_all('script'):
			scr=str(script)
			r=re.findall(r'"http://.*?"',scr)
			for sc in r:
				if sc:
					rs=re.search(r'"http://.*?"',sc)
					if rs:
						l=rs.group().replace('"','')
						linklist.append(l)
		return linklist
from BloomFilter import BloomFilter
import random
import string

def removeMatchingWord( aWordToFind, aWords ):
    """Remove the first occurrence of aWordToFind from aWords, if there is one.

    aWords is modified in place; nothing happens when the word is absent.
    """
    if aWordToFind not in aWords:
        return

    aWords.remove( aWordToFind )

if __name__ == '__main__':
    # Estimates the filter's false-positive rate: load the system word list,
    # probe random 5-letter strings until 300 are reported present, then
    # discard the ones that really are dictionary words.

    # Read the word list once and close the file; previously it was opened
    # twice inside side-effect list comprehensions and never closed.
    with open('/usr/share/dict/words') as theWordFile:
        theDictionaryWords = [ theLine.strip() for theLine in theWordFile ]

    theBloomFilter = BloomFilter()
    for theWord in theDictionaryWords:
        theBloomFilter.addWord( theWord )

    theWrongWords = []
    theCountOfRandomWords = 0

    while len(theWrongWords) < 300:
        theCountOfRandomWords = theCountOfRandomWords + 1
        theRandomString = ''.join(random.choice(string.ascii_lowercase) for x in range(5))

        if theBloomFilter.checkWord( theRandomString ):
            theWrongWords.append( theRandomString )

    # Single-argument print(...) gives the same output with the print
    # statement (Python 2) and the print function.
    print( '# random words checked: ' + str( theCountOfRandomWords ) )

    # Hits that are genuine dictionary words are not false positives.
    for theWord in theDictionaryWords:
        removeMatchingWord( theWord, theWrongWords )

    print( '# false positives: ' + str( len( theWrongWords ) ) )

    print( 'false positive rate: ' + str( 100.0 * float( len( theWrongWords ) ) / float( theCountOfRandomWords ) ) )