from collections import defaultdict
from itertools import combinations

import cuckoofilter

# getTime() is a timestamp helper defined elsewhere in the original module.
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport, freqSet):
    _itemSet = set()
    localSet = defaultdict(int)

    def set2Str(cdd):
        return "_".join(cdd)

    filterCdd = cuckoofilter.CuckooFilter(capacity=len(itemSet), fingerprint_size=1)
    print("Store cdds in CF ... - %s" % getTime())
    for val in itemSet:
        filterCdd.insert(set2Str(val))
    print("Mapping cddFromTrans on CF ... - %s" % getTime())
    for trans in transactionList:
        # zi 2016-11-12: time-consuming; even with only a few candidates, every
        # transaction must still be scanned and every combination evaluated.
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if filterCdd.contains(set2Str(cdd)):
                freqSet[cdd] += 1   # zi: keep one global count
                localSet[cdd] += 1  # zi: keep one local (item, count), then filter out entries below minSupport
    print("Filter cdds whose support is less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
import gzip

import cuckoofilter

def main():
    total_items = 100000
    cf = cuckoofilter.CuckooFilter(total_items, 2)

    # insert total_items distinct keys
    num_inserted = 0
    for i in range(total_items):
        cf.insert(str(i))
        num_inserted = num_inserted + 1

    # every inserted key must be found (no false negatives)
    for i in range(num_inserted):
        assert cf.contains(str(i))

    # query keys that were never inserted to measure the false positive rate
    total_queries = 0
    false_queries = 0
    for i in range(total_items, 2 * total_items):
        if cf.contains(str(i)):
            false_queries = false_queries + 1
        total_queries = total_queries + 1

    serialized = cf.serialize().read()

    print('False positive rate is {:%}'.format(false_queries / total_queries))
    print("size after serialize: {:}".format(len(serialized)))
    print("size after serialize + gzip: {:}".format(len(gzip.compress(serialized))))
# Same candidate-filtering step as above, without the progress logging.
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport, freqSet):
    _itemSet = set()
    localSet = defaultdict(int)

    def set2Str(cdd):
        return "_".join(cdd)

    filterCdd = cuckoofilter.CuckooFilter(capacity=len(itemSet), fingerprint_size=1)
    for val in itemSet:
        filterCdd.insert(set2Str(val))
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if filterCdd.contains(set2Str(cdd)):
                freqSet[cdd] += 1   # zi: keep one global count
                localSet[cdd] += 1  # zi: keep one local (item, count), then filter out entries below minSupport
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)
    return _itemSet
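# A minimal usage sketch for returnItemsWithMinSupportV4, assuming an
# Apriori-style driver loop. joinSet and runApriori are hypothetical helpers
# written for illustration; they are not part of the original source.
from collections import defaultdict
from itertools import chain

def joinSet(itemSet, length):
    # hypothetical helper: union pairs of surviving itemsets into length-k candidates
    return set(a.union(b) for a in itemSet for b in itemSet if len(a.union(b)) == length)

def runApriori(transactionList, minSupport):
    # seed with 1-item candidates drawn from the transactions (items assumed to be strings)
    itemSet = set(frozenset([item]) for item in chain.from_iterable(transactionList))
    freqSet = defaultdict(int)
    k = 1
    while itemSet:
        itemSet = returnItemsWithMinSupportV4(itemSet, k, transactionList, minSupport, freqSet)
        itemSet = joinSet(itemSet, k + 1)
        k += 1
    return freqSet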
def bake(self):
    old = self._words
    self._words = []
    print("#adding unique words...")
    unique = {}
    for word in old:
        word_lower = word.lower().strip()
        # if word_lower not in self._words:
        if " " in word_lower:
            print("#WARNING in bake: " + str(word_lower) + " has spaces so splitting")
            for subw in word_lower.split(" "):
                subw_strip = subw.strip()
                if len(subw_strip) > 0:
                    unique[subw_strip] = True
        else:
            unique[word_lower] = True
    self._words = list(unique)
    self._w_dict = unique
    self._words = sorted(self._words)
    self._words_set = set(self._words)
    print("#using " + str(len(self._words)) + " words")
    if cf_enable:  # cf_enable is a module-level flag in the original source
        self._cf = cuckoofilter.CuckooFilter(capacity=len(self._words), fingerprint_size=1)
        print("# Filling cuckoo filter...")
        for word in self._words:
            self._cf.insert(word)
        print("# done (cuckoo filter ready)")
        self.spell = self.spell_cuckoo
    else:
        # self.spell = self.spell_bisect
        # print("#SpellFake is using method: spell_bisect")
        self.spell = self.spell_d_hash
        print("#SpellFake is using method: spell_d_hash")
    tmp_name = "prev-NoDictAnagram-list.txt"
    with open(tmp_name, 'w') as outs:
        for word in self._words:
            outs.write(word + "\n")
    print("#wrote " + tmp_name)
    self._baked = True
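# Illustrative sketch, not from the original class: given the fields bake() sets
# up, spell_cuckoo could be as simple as a membership query. A cuckoo filter has
# no false negatives, so every real word passes; a rare false positive means a
# misspelling may occasionally be accepted.
def spell_cuckoo(self, word):
    return self._cf.contains(word.lower().strip())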
def acf_create(self, rho, c_for_acf):
    """
    Build the cuckoo filters that store the encrypted mapping relation.
    """
    # initialize the cuckoo filters
    cfList = [cuckoofilter.CuckooFilter(self.capacity, self.bucketSize, self.fingerprint_size)
              for i in range(self.maxCFNumber)]
    # insert each mapping relation into the cuckoo filters
    for attr, row in rho.items():
        # convert attr and row to int first, because '{:032b}'.format(x) requires an int
        attr_str = '{:032b}'.format(int(attr))  # pad attr to 32 bits
        row_str = '{:032b}'.format(int(row))    # pad row to 32 bits
        attr_row_str = attr_str + row_str       # concatenate attr_str and row_str
        # XOR c_for_acf with attr_row_str bit by bit
        value_for_xor = ""
        for i in range(len(c_for_acf)):
            bit = int(c_for_acf[i]) ^ int(attr_row_str[i])
            value_for_xor += str(bit)
        # store the XOR result in the cuckoo filters
        fingerprint = hashutils.fingerprint(attr, self.fingerprint_size)
        cuckoofilter.new_antiCollisionInsert(cfList=cfList, fingerprint=fingerprint,
                                             value=value_for_xor)
    return cfList
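# Hedged companion sketch: since XOR is its own inverse, a stored value can be
# decoded by XORing it with the same c_for_acf. decode_acf_value is illustrative
# only and not part of the original class.
def decode_acf_value(value_for_xor, c_for_acf):
    attr_row_str = ''.join(str(int(v) ^ int(c)) for v, c in zip(value_for_xor, c_for_acf))
    attr = int(attr_row_str[:32], 2)  # first 32 bits encode attr
    row = int(attr_row_str[32:], 2)   # last 32 bits encode row
    return attr, row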
def cf():
    # small filter instance for tests: capacity 1000
    return cuckoofilter.CuckooFilter(1000, 4)
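# A minimal sketch of how a fixture like cf() is typically consumed, assuming it
# is registered with @pytest.fixture in the test module and that the filter
# exposes the insert/contains API used elsewhere in this collection.
def test_insert_then_contains(cf):
    cf.insert('hello')
    assert cf.contains('hello')  # cuckoo filters have no false negatives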
'''
Example usage. Modeled after
https://github.com/efficient/cuckoofilter/blob/master/example/test.cc
'''
import cuckoofilter

if __name__ == '__main__':
    total_items = 100000
    cf = cuckoofilter.CuckooFilter(total_items, 2)

    num_inserted = 0
    for i in range(total_items):
        cf.insert(str(i))
        num_inserted = num_inserted + 1

    # every inserted key must be found (no false negatives)
    for i in range(num_inserted):
        assert cf.contains(str(i))

    # keys that were never inserted estimate the false positive rate
    total_queries = 0
    false_queries = 0
    for i in range(total_items, 2 * total_items):
        if cf.contains(str(i)):
            false_queries = false_queries + 1
        total_queries = total_queries + 1

    print('False positive rate is {:%}'.format(false_queries / total_queries))
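# Hedged sanity check for the rate printed above: with b-slot buckets and f-bit
# fingerprints, a cuckoo filter's false positive rate is roughly 2 * b / 2**f.
# The numbers below assume fingerprint_size is given in bytes (as the
# fingerprint_size=1 calls elsewhere in this collection suggest) and b = 4.
def expected_fpr(fingerprint_bytes, bucket_size=4):
    f_bits = 8 * fingerprint_bytes
    return 2.0 * bucket_size / (2 ** f_bits)

print('expected FPR ~ {:%}'.format(expected_fpr(2)))  # ~0.012% for 16-bit fingerprints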
def findCollisionCuckoo(self, hashPart=None, filterCapacity=10000000):
    """
    Search for a truncated-hash collision using a Cuckoo filter (large memory variant).

    :param hashPart: the input hash loaded from a file
    :param filterCapacity: the capacity of the Cuckoo filter
    :return: dict with inputString, inputHash, time, indexOfFirst, indexOfLast,
             collisionHash, cyclesBetCol, firstTemp, lastTemp, dataStructConsum
    """
    try:
        if not hashPart:
            hashPart = self.hashPart
            hashPartLength = self.hashPartLength
        else:
            hashPartLength = len(hashPart)
        if '.txt' not in str(self.inputFile):
            inputString = self.inputFile
        else:
            inputString = ''
        status = 0
        indexOfFirst = 0
        firstTemp = ''
        indexOfLast = 0
        lastTemp = ''
        newHashPart = bytes(hashPart, 'utf-8')
        cf = cuckoofilter.CuckooFilter(capacity=filterCapacity, fingerprint_size=1)
        start = timeit.default_timer()
        # walk the chain h -> sha256(h)[:hashPartLength] until the filter reports a repeat
        while True:
            if newHashPart not in cf:
                cf.insert(newHashPart)
                indexOfLast += 1
                status += 1
                if status == 10000000:  # periodic progress report
                    print('\n' * 100)
                    print('Count of cycles:', indexOfLast)
                    print('Run time:', round((timeit.default_timer() - start), 3), 's')
                    status = 0
                lastTemp = newHashPart.decode('utf-8')
                newHash = hashlib.sha256(newHashPart).hexdigest()
                newHash = newHash[0:hashPartLength]
                newHashPart = bytes(newHash, 'utf-8')
            else:
                if indexOfLast >= filterCapacity:
                    print("!!! filterCapacity reached !!!")
                    break
                print("### Potential collision successfully passed! ###")
                print("Suspicious hash: ", newHash)
                print('Count of cycles:', indexOfLast)
                print('Time:', round((timeit.default_timer() - start), 3), 's')
                # replay the chain from the start to find the first index of the repeated hash
                indexOfFirst = 0
                collisionHash = newHashPart
                newHashPart = bytes(hashPart, 'utf-8')
                while newHashPart != collisionHash:
                    indexOfFirst += 1
                    status += 1
                    if status == 10000000:
                        print('\n' * 100)
                        print('Suspicious hash found! :) Searching for collision index...')
                        print('Count of cycles:', indexOfFirst)
                        print('Run time:', round((timeit.default_timer() - start), 3), 's')
                        status = 0
                    firstTemp = newHashPart.decode('utf-8')
                    newHash = hashlib.sha256(newHashPart).hexdigest()
                    newHash = newHash[0:hashPartLength]
                    newHashPart = bytes(newHash, 'utf-8')
                if indexOfFirst != indexOfLast:
                    break  # genuine collision: two different chain positions share a hash
                else:
                    # the filter hit was a false positive; re-insert and keep walking
                    print('False positive hash detected :(')
                    indexOfLast += 1
                    status += 1
                    cf.insert(newHashPart)
                    newHash = hashlib.sha256(newHashPart).hexdigest()
                    newHash = newHash[0:hashPartLength]
                    newHashPart = bytes(newHash, 'utf-8')
        stop = timeit.default_timer()
        totalTime = round(stop - start, 12)
        totalMemory = round(sys.getsizeof(cf) / 1048576, 3)
        if indexOfFirst != indexOfLast and filterCapacity > indexOfLast:
            print('\n\n##### findCollisionCuckoo - Collision found process succeeded! \\o/ #####\n')
            print("Collision found after %s seconds" % (totalTime), '\n')
            if inputString:
                print('Input string:', inputString)
            print('Input hashPart:', hashPart)
            print('\nCollision hash:', newHash)
            print('Hash 1 leading to collision:', firstTemp)
            print('Hash 2 leading to collision:', lastTemp)
            print('\nIndex of first collision:', indexOfFirst)
            print('Index of last collision:', indexOfLast)
            print('Cycles between collision hashes:', indexOfLast - indexOfFirst)
            print('\nCuckoo filter used', round(sys.getsizeof(cf) / 1024 / 1024, 3), 'MB')
            return {"inputString": inputString,
                    "inputHash": hashPart,
                    "time": totalTime,
                    "indexOfFirst": indexOfFirst,
                    "indexOfLast": indexOfLast,
                    "collisionHash": newHashPart,
                    "cyclesBetCol": indexOfLast - indexOfFirst,
                    "firstTemp": firstTemp,
                    "lastTemp": lastTemp,
                    "dataStructConsum": totalMemory}
        else:
            print('\n##### findCollisionCuckoo - Collision found process failed! /o\\ #####')
    except Exception as e:
        print(str(e))
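# Hedged standalone sketch of the same chain-walking idea using a plain set
# instead of a cuckoo filter. A set has no false positives, so the re-check
# branch above is unnecessary here; the cuckoo filter trades that certainty for
# a much smaller memory footprint at large filterCapacity.
import hashlib

def find_truncated_sha256_cycle(seed, n):
    seen = {}
    h, i = seed, 0
    while h not in seen:
        seen[h] = i
        h = hashlib.sha256(h.encode('utf-8')).hexdigest()[:n]
        i += 1
    return seen[h], i  # index of first occurrence, index where the repeat appears

first, last = find_truncated_sha256_cycle('42', 6)  # 6 hex chars = 24-bit truncation
print(first, last, last - first)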