Пример #1
0
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    """Return the candidates whose support is strictly above ``minSupport``.

    Every candidate in ``itemSet`` is serialised to a string and stored in a
    cuckoo filter.  Each transaction is then expanded into all of its
    ``lenItem``-sized combinations, and every combination the filter reports
    as present is counted.

    Args:
        itemSet: iterable of candidate item collections; the items must be
            strings because they are joined with ``"_"``.
        lenItem: number of items per candidate.
        transactionList: sized iterable of transactions.
        minSupport: support threshold; a candidate is kept only when
            ``count / len(transactionList)`` is strictly greater than it.
        freqSet: mapping updated in place with ``freqSet[cdd] += 1`` for every
            counted combination.

    Returns:
        set of ``frozenset`` candidates that passed the threshold.

    NOTE(review): the filter lookup is presumably probabilistic, so a
    combination that is not in ``itemSet`` may occasionally be counted --
    confirm against the cuckoofilter package in use.
    """
    _itemSet = set()
    localSet = defaultdict(int)

    def set2Str(cdd):
        # Sets have no guaranteed iteration order, so two equal sets could
        # serialise to different strings and a real candidate would be missed
        # by the filter.  Sorting yields one canonical key per candidate.
        return "_".join(sorted(cdd))

    filterCdd = cuckoofilter.CuckooFilter(capacity=len(itemSet),
                                          fingerprint_size=1)
    print("Store cdds in CF ... - %s" % getTime())
    for val in itemSet:
        filterCdd.insert(set2Str(val))
    print("Mapping cddFromTrans on CF ... - %s" % getTime())
    # Costly step: even when there are only a few candidates, every
    # transaction is scanned and all of its combinations are generated.
    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if filterCdd.contains(set2Str(cdd)):
                freqSet[cdd] += 1  # global tally shared with the caller
                localSet[cdd] += 1  # local tally, filtered by minSupport below
    print("Filter cdds that less than minSup. - %s" % getTime())
    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)

    return _itemSet
Пример #2
0
def main():
    """Fill a cuckoo filter, measure its false-positive rate and its size.

    Inserts the decimal strings ``"0"`` .. ``"99999"``, asserts that each of
    them is reported as present, probes the next 100000 integers (never
    inserted) to count false positives, then prints the rate together with
    the serialised size, raw and gzip-compressed.
    """
    total_items = 100000
    cf = cuckoofilter.CuckooFilter(total_items, 2)

    inserted_keys = []
    for number in range(total_items):
        key = str(number)
        cf.insert(key)
        inserted_keys.append(key)

    # Everything that was inserted must be found again.
    for key in inserted_keys:
        assert cf.contains(key)

    # None of these keys was inserted, so every hit is a false positive.
    probes = range(total_items, 2 * total_items)
    false_queries = sum(1 for number in probes if cf.contains(str(number)))
    total_queries = len(probes)

    serialized = cf.serialize().read()
    compressed_size = len(gzip.compress(serialized))

    print('False positive rate is {:%}'.format(false_queries / total_queries))
    print("size after serialize: {:}".format(len(serialized)))
    print("size after serialize + gzip: {:}".format(compressed_size))
def returnItemsWithMinSupportV4(itemSet, lenItem, transactionList, minSupport,
                                freqSet):
    """Return the candidates whose support is strictly above ``minSupport``.

    Every candidate in ``itemSet`` is serialised to a string and stored in a
    cuckoo filter.  Each transaction is then expanded into all of its
    ``lenItem``-sized combinations, and every combination the filter reports
    as present is counted.

    Args:
        itemSet: iterable of candidate item collections; the items must be
            strings because they are joined with ``"_"``.
        lenItem: number of items per candidate.
        transactionList: sized iterable of transactions.
        minSupport: support threshold; a candidate is kept only when
            ``count / len(transactionList)`` is strictly greater than it.
        freqSet: mapping updated in place with ``freqSet[cdd] += 1`` for every
            counted combination.

    Returns:
        set of ``frozenset`` candidates that passed the threshold.

    NOTE(review): the filter lookup is presumably probabilistic, so a
    combination that is not in ``itemSet`` may occasionally be counted --
    confirm against the cuckoofilter package in use.
    """
    _itemSet = set()
    localSet = defaultdict(int)

    def set2Str(cdd):
        # Sets have no guaranteed iteration order, so two equal sets could
        # serialise to different strings and a real candidate would be missed
        # by the filter.  Sorting yields one canonical key per candidate.
        return "_".join(sorted(cdd))

    filterCdd = cuckoofilter.CuckooFilter(capacity=len(itemSet),
                                          fingerprint_size=1)
    for val in itemSet:
        filterCdd.insert(set2Str(val))

    for trans in transactionList:
        for cdd in combinations(trans, lenItem):
            cdd = frozenset(cdd)
            if filterCdd.contains(set2Str(cdd)):
                freqSet[cdd] += 1  # global tally shared with the caller
                localSet[cdd] += 1  # local tally, filtered by minSupport below

    for item, count in localSet.items():
        support = float(count) / len(transactionList)
        if support > minSupport:
            _itemSet.add(item)

    return _itemSet
Пример #4
0
 def bake(self):
     """Normalise the collected words and build the lookup structures.

     Each entry of ``self._words`` is lower-cased and stripped.  An entry
     that still contains a space is split on spaces (with a warning) and
     each non-empty piece is kept as its own word.  Empty words are dropped.

     After the call:
       * ``self._w_dict``    maps every unique word to ``True``
       * ``self._words``     is the sorted list of unique words
       * ``self._words_set`` is the same words as a set
       * ``self.spell``      points at ``self.spell_cuckoo`` when the
         ``cf_enable`` flag (a name resolved outside this method) is truthy,
         otherwise at ``self.spell_d_hash``
       * ``self._cf``        is a filled cuckoo filter (only if ``cf_enable``)
       * ``self._baked``     is ``True``

     The word list is also written, one word per line, to
     ``prev-NoDictAnagram-list.txt`` in the current working directory.
     """
     old = self._words
     self._words = []
     print("#adding unique words...")
     unique = {}
     for word in old:
         word_lower = word.lower().strip()
         if " " in word_lower:
             print("#WARNING in bake: " + str(word_lower) +
                   " has spaces so splitting")
             pieces = [subw.strip() for subw in word_lower.split(" ")]
         else:
             pieces = [word_lower]
         for piece in pieces:
             # Skip blanks in both branches so an empty or whitespace-only
             # entry never becomes a "word".
             if piece:
                 unique[piece] = True
     self._w_dict = unique
     self._words = sorted(unique)
     self._words_set = set(self._words)
     print("#using " + str(len(self._words)) + " words")
     if cf_enable:
         self._cf = cuckoofilter.CuckooFilter(capacity=len(self._words),
                                              fingerprint_size=1)
         print("# Filling cuckoo filter...")
         for word in self._words:
             self._cf.insert(word)
         print("#   done (cuckoo filter ready)")
         self.spell = self.spell_cuckoo
     else:
         # self.spell = self.spell_bisect
         # print("#SpellFake is using method: spell_bisect")
         self.spell = self.spell_d_hash
         print("#SpellFake is using method: spell_d_hash")
     tmp_name = "prev-NoDictAnagram-list.txt"
     # The context manager closes the file even if a write fails.
     with open(tmp_name, 'w') as outs:
         outs.writelines(word + "\n" for word in self._words)
     print("#wrote " + tmp_name)
     self._baked = True
Пример #5
0
    def acf_create(self, rho, c_for_acf):
        """
        Build the cuckoo filters that store the masked attribute/row mapping.

        For every ``attr -> row`` pair in ``rho`` both values are converted
        to ``int``, rendered as zero-padded 32-character binary strings and
        concatenated.  That bit string is XOR-ed, position by position, with
        the bits of ``c_for_acf``, and the resulting bit string is inserted
        into the filters under the fingerprint of ``attr``.

        :param rho: mapping of attribute to row; keys and values must be
            accepted by ``int()``.
        :param c_for_acf: sequence of ``0``/``1`` digits used as the XOR
            mask.  The result has ``len(c_for_acf)`` bits; a mask longer than
            the concatenated bit string raises ``IndexError``.
        :return: list of ``self.maxCFNumber`` cuckoo filters.

        NOTE(review): the 32-character width only holds for values in
        ``0 .. 2**32 - 1``; negative or larger values yield a different
        layout -- confirm the callers guarantee that range.
        """

        cfList = [cuckoofilter.CuckooFilter(self.capacity, self.bucketSize, self.fingerprint_size)
                  for _ in range(self.maxCFNumber)]  # initialize the cuckoo filters
        # Insert every mapping relation into the cuckoo filters.
        for attr, row in rho.items():
            # '{:032b}' needs an int, so convert before formatting.
            attr_bits = '{:032b}'.format(int(attr))  # zero-padded to 32 characters
            row_bits = '{:032b}'.format(int(row))  # zero-padded to 32 characters
            attr_row_str = attr_bits + row_bits  # attribute bits followed by row bits
            # XOR the mask with the mapping bits.  Walking the mask once
            # avoids rebuilding list copies of both strings for every bit.
            value_for_xor = "".join(
                str(int(mask_bit) ^ int(attr_row_str[pos]))
                for pos, mask_bit in enumerate(c_for_acf))
            # Store the masked value under the fingerprint of the attribute.
            fingerprint = hashutils.fingerprint(attr, self.fingerprint_size)
            cuckoofilter.new_antiCollisionInsert(cfList=cfList, fingerprint=fingerprint, value=value_for_xor)
        return cfList
Пример #6
0
def cf():
    """Return a fresh cuckoo filter built with the arguments ``(1000, 4)``."""
    # NOTE(review): presumably (capacity, fingerprint size) -- confirm
    # against the cuckoofilter package in use.
    constructor_args = (1000, 4)
    return cuckoofilter.CuckooFilter(*constructor_args)
Пример #7
0
'''
Example usage. Modeled after https://github.com/efficient/cuckoofilter/blob/master/example/test.cc
'''

import cuckoofilter

if __name__ == '__main__':
    total_items = 100000
    cf = cuckoofilter.CuckooFilter(total_items, 2)

    # Insert the decimal strings "0" .. "99999", remembering what went in.
    inserted_keys = []
    for number in range(total_items):
        key = str(number)
        cf.insert(key)
        inserted_keys.append(key)

    # Everything that was inserted must be found again.
    for key in inserted_keys:
        assert cf.contains(key)

    # None of these keys was inserted, so every hit is a false positive.
    probes = range(total_items, 2 * total_items)
    false_queries = sum(1 for number in probes if cf.contains(str(number)))
    total_queries = len(probes)

    print('False positive rate is {:%}'.format(false_queries / total_queries))
Пример #8
0
    def findCollisionCuckoo(self, hashPart=None, filterCapacity=10000000):
        """
        Search for a collision of a truncated SHA-256 chain, using a cuckoo
        filter to remember the values already visited.

        Starting from ``hashPart``, each step hashes the current value with
        SHA-256 and keeps the first ``hashPartLength`` hex characters.  Every
        value is inserted into the filter.  When the filter reports a value
        as already present, the chain is replayed from the start to find the
        first index at which that value occurs.  If this index equals the
        current one, the hit was a false positive and the walk continues;
        otherwise a real collision was found.

        :param hashPart: the starting hash prefix; when falsy,
            ``self.hashPart`` and ``self.hashPartLength`` are used instead
        :param filterCapacity: the capacity of the cuckoo filter; the search
            gives up once a filter hit occurs at or beyond this many steps
        :return: on success a dict with the keys ``inputString``,
            ``inputHash``, ``time``, ``indexOfFirst``, ``indexOfLast``,
            ``collisionHash`` (bytes), ``cyclesBetCol``, ``firstTemp``,
            ``lastTemp`` and ``dataStructConsum``; ``None`` when no collision
            was found or when any exception occurred (the exception message
            is printed, not raised)

        NOTE: ``dataStructConsum`` comes from ``sys.getsizeof``, which is
        shallow and may under-report the memory held by the filter.
        """
        try:
            if not hashPart:
                hashPart = self.hashPart
                hashPartLength = self.hashPartLength
            else:
                hashPartLength = len(hashPart)

            if '.txt' not in str(self.inputFile):
                inputString = self.inputFile
            else:
                inputString = ''

            def nextLink(current):
                # One chain step: SHA-256 of the current bytes, truncated to
                # the prefix length; returned both as str and as bytes.
                digest = hashlib.sha256(current).hexdigest()[0:hashPartLength]
                return digest, bytes(digest, 'utf-8')

            status = 0  # steps since the last progress report
            indexOfFirst = 0
            firstTemp = ''
            indexOfLast = 0
            lastTemp = ''

            newHashPart = bytes(hashPart, 'utf-8')
            cf = cuckoofilter.CuckooFilter(
                capacity=filterCapacity, fingerprint_size=1)
            start = timeit.default_timer()

            while True:
                if newHashPart not in cf:
                    cf.insert(newHashPart)
                    indexOfLast += 1
                    status += 1
                    if status == 10000000:
                        print('\n' * 100)
                        print('Count of cycles:', indexOfLast)
                        print('Run time:', round(
                            (timeit.default_timer() - start), 3), 's')
                        status = 0
                    lastTemp = newHashPart.decode('utf-8')
                    newHash, newHashPart = nextLink(newHashPart)
                else:
                    if indexOfLast >= filterCapacity:
                        print("!!! filterCapacity reached !!!")
                        break
                    print("### Potencional collision successfully passed! ###")
                    print("Suspicious hash: ", newHash)
                    print('Count of cycles:', indexOfLast)
                    print('Time:', round((timeit.default_timer() - start), 3), 's')

                    # Replay the chain from the start to locate the first
                    # occurrence of the suspicious value.
                    indexOfFirst = 0
                    collisionHash = newHashPart
                    newHashPart = bytes(hashPart, 'utf-8')
                    while newHashPart != collisionHash:
                        indexOfFirst += 1
                        status += 1
                        if status == 10000000:
                            print('\n' * 100)
                            print(
                                'Suspicious hash found! :) Searching for collision index...')
                            print('Count of cycles:', indexOfFirst)
                            print('Run time:', round(
                                (timeit.default_timer() - start), 3), 's')
                            status = 0
                        firstTemp = newHashPart.decode('utf-8')
                        newHash, newHashPart = nextLink(newHashPart)

                    if indexOfFirst != indexOfLast:
                        break
                    else:
                        print('False positive hash detected :(')
                        indexOfLast += 1
                        status += 1
                        cf.insert(newHashPart)
                        # Keep the predecessor current; otherwise a real
                        # collision on the very next step would report a
                        # stale "Hash 2 leading to collision".
                        lastTemp = newHashPart.decode('utf-8')
                        newHash, newHashPart = nextLink(newHashPart)

            stop = timeit.default_timer()
            totalTime = round(stop - start, 12)
            totalMemory = round(sys.getsizeof(cf) / 1048576, 3)

            if indexOfFirst != indexOfLast and filterCapacity > indexOfLast:
                print(
                    '\n\n##### findCollisionCuckoo - Collision found process succeeded! \\o/ #####\n')
                print("Collision found after %s seconds" % (totalTime), '\n')
                if inputString:
                    print('Input string:', inputString)
                print('Input hashPart:', hashPart)
                print('\nCollision hash:', newHash)
                print('Hash 1 leading to collision:', firstTemp)
                print('Hash 2 leading to collision:', lastTemp)
                print('\nIndex of first collision:', indexOfFirst)
                print('Index of last collision:', indexOfLast)
                print('Cycles between collision hashes:',
                      indexOfLast - indexOfFirst)
                print('\nCuckoo filter used', totalMemory, 'MB')

                return {
                    "inputString": inputString,
                    "inputHash": hashPart,
                    "time": totalTime,
                    "indexOfFirst": indexOfFirst,
                    "indexOfLast": indexOfLast,
                    "collisionHash": newHashPart,
                    "cyclesBetCol": indexOfLast -
                    indexOfFirst,
                    "firstTemp": firstTemp,
                    "lastTemp": lastTemp,
                    "dataStructConsum": totalMemory}
            else:
                print(
                    '\n##### findCollisionCuckoo - Collision found process failed! /o\\ #####')

        except Exception as e:
            # Best-effort behaviour kept from the original: report the error
            # and fall through, so the caller receives None.
            print(str(e))