예제 #1
0
def test_hash128():
    """Check mmh3.hash128 on 'foo' for the default seed, seed 42, and both
    values of the ``signed`` flag."""
    unsigned_seed_42 = 215966891540331383248189432718888555506
    signed_seed_42 = -124315475380607080215185174712879655950

    # No explicit seed, default output form.
    assert mmh3.hash128('foo') == 168394135621993849475852668931176482145

    # With seed 42 the default output equals the explicit signed=False output.
    assert mmh3.hash128('foo', 42) == unsigned_seed_42
    assert mmh3.hash128('foo', 42, signed=False) == unsigned_seed_42

    # signed=True reports the same input as a negative integer.
    assert mmh3.hash128('foo', 42, signed=True) == signed_seed_42
예제 #2
0
    def generate_table(self, item_ids):
        """
        Given a list of item IDs, generate a corresponding IBLT.

        Each item is hashed once per seed in ``self.seed_list``; every hash
        selects the cell ``hash % self.m`` into which the item is folded.

        Args:
            item_ids(list): IDs of the items to be included in the IBLT. Each
                ID is XOR-ed into the cells, so it must support ``^`` with int.

        Returns:
            list: An invertible bloom lookup table of ``self.m`` cells, each a
                tuple of the form (id_sum, hash_sum, count).
        """
        bloom = [(0, 0, 0)] * self.m
        for item in item_ids:
            # Encode once per item instead of once per hash call.
            encoded = str(item).encode()
            # The check hash is the same for every cell the item lands in,
            # so it is computed once rather than inside the cell loop.
            item_hash = mmh3.hash128(encoded, self.element_hash)
            cell_hashes = [mmh3.hash128(encoded, seed)
                           for seed in self.seed_list]
            for cell_hash in cell_hashes:
                index = cell_hash % self.m
                id_sum, hash_sum, count = bloom[index]
                # XOR with 0 is the identity, so an empty cell needs no
                # special case.
                bloom[index] = (id_sum ^ item, hash_sum ^ item_hash, count + 1)
        return bloom
예제 #3
0
    def peel_element(self, element_id, table, alteration):
        """
        Remove one element's contribution from an IBLT, modifying it in place.

        Args:
            element_id(int): The element to be peeled.
            table(list): The invertible bloom lookup table; cells are indexed
                as (id_sum, hash_sum, count).
            alteration(int): Amount subtracted from the count of every touched
                cell; indicates which list the element was stored in (1 OR -1).

        Returns:
            list:
                The same table object, with the element XOR-ed out of every
                cell its seeded hashes select.
        """
        encoded_id = str(element_id).encode()
        check_hash = mmh3.hash128(encoded_id, self.element_hash)
        cell_hashes = [mmh3.hash128(encoded_id, seed)
                       for seed in self.seed_list]
        for cell_hash in cell_hashes:
            slot = cell_hash % self.m
            cell = table[slot]
            new_id_sum = cell[0] ^ element_id
            new_hash_sum = (check_hash if cell[1] == 0
                            else cell[1] ^ check_hash)
            table[slot] = (new_id_sum, new_hash_sum, cell[2] - alteration)
        return table
예제 #4
0
    def generate_table(item_ids,
                       seed_key,
                       table_size=_M,
                       max_hashes=MAX_HASHES,
                       a_value=DEFAULT_A_VALUE,
                       hash_decider=None,
                       hash_decider_length=MAX_RANDOM_HASHES,
                       seed_range=MAX_RANDOM_HASHES):
        """
        Build an IBLT in which the number of cells per item is chosen from a
        table of hash quantities.

        Args:
            item_ids: The IDs of the items to be inserted.
            seed_key: Shared key used to hash each item and to derive the seed
                list (and the hash decider when none is supplied).
            table_size: Size of the IBLT.
            max_hashes: Upper bound for total hashes to be used.
            a_value: The value for a in the ALOHA style distribution function.
            hash_decider(list[int]): List of hash quantities; an item uses the
                entry at ``item_hash % len(hash_decider)``. Generated via
                ``IBLT.generate_hash_decider`` when None.
            hash_decider_length: Size of the generated hash decider list.
            seed_range: The upper bound of the values of any given seed key.

        Returns:
            tuple[list[tuple], list[int], list[int]]: ``(bloom, seed_list,
            hash_decider)`` where ``bloom`` is a list of
            (idSum, hashSum, count) tuples.
        """
        if hash_decider is None:
            hash_decider = IBLT.generate_hash_decider(
                seed_key, max_hashes, a_value, hash_decider_length)
        seed_list = IBLT.generate_seed_list(seed_key, max_hashes, seed_range)

        bloom = [(0, 0, 0)] * table_size
        for item in item_ids:
            encoded_item = str(item).encode()
            item_hash = mmh3.hash128(encoded_item, seed_key)

            # The item's own hash picks how many cells it is written to.
            copies = hash_decider[item_hash % len(hash_decider)]
            cell_hashes = [mmh3.hash128(encoded_item, seed_list[n])
                           for n in range(copies)]
            for cell_hash in cell_hashes:
                slot = cell_hash % table_size
                cell = bloom[slot]
                checksum = item_hash if cell[1] == 0 else cell[1] ^ item_hash
                bloom[slot] = (cell[0] ^ item, checksum, cell[2] + 1)
        return bloom, seed_list, hash_decider
예제 #5
0
def lemmatizeFile(file):
    """Count the words of a text file and index the counts by murmur hash.

    A word is a maximal run of ASCII letters and apostrophes, lower-cased.
    The module-level ``processedWords`` dict is updated with every newly seen
    word; ``exceptions`` and ``stopWords`` are consulted for new words only.

    Args:
        file: Path of the text file to read.

    Returns:
        dict: ``{mmh3.hash128(word): [word, count]}`` for every counted word.
    """
    # Local import: this snippet's module-level imports are not visible here.
    import re

    wordsHashMap = {}
    words = {}

    def record(token):
        # Bump the running count and refresh the hash-map entry.
        words[token] = words.get(token, 0) + 1
        wordsHashMap[mmh3.hash128(token)] = [token, words[token]]

    # The context manager closes the file even if hashing raises.
    with open(file, 'r') as f:
        # Iterating by line keeps buffered I/O; a word never spans a line
        # break because the newline is not a word character. A word at the
        # very end of the file is picked up as well, which the previous
        # character-by-character loop dropped when no delimiter followed it.
        for line in f:
            for raw in re.findall(r"[A-Za-z']+", line):
                word = raw.lower()
                if word in processedWords:
                    # NOTE(review): a stop word is skipped only the first
                    # time it is seen; afterwards it is in processedWords and
                    # is counted here. Confirm this is intended.
                    record(word)
                    continue
                # Lemmatization is disabled upstream, so the canonical form
                # is the word itself.
                canonical = word
                if word in exceptions:
                    record(word)
                elif word not in stopWords:
                    record(canonical)
                processedWords[word] = canonical
    return wordsHashMap
예제 #6
0
def get_hash(i, value):
    """Return the SHA-256 digest of *value* (as an integer) plus a multiple
    of its 128-bit murmur hash.

    The multiplier is *i*, except that ``i == 0`` is replaced by -12345.
    """
    # https://stackoverflow.com/questions/11954086/which-hash-functions-to-use-in-a-bloom-filter
    multiplier = -12345 if i == 0 else i
    sha_digest = hashlib.sha256(value.encode('utf-8')).hexdigest()
    return int(sha_digest, 16) + (multiplier * mmh3.hash128(value))
예제 #7
0
def hash_vulnerability(data: dict):
    """Hash the identifying fields of a vulnerability record.

    The values of 'cve', 'id', 'cwe' and 'title' that are present in *data*
    are joined with '#', in that order, and hashed with mmh3.hash128.
    """
    identifying_fields = ('cve', 'id', 'cwe', 'title')
    present = [data[field] for field in identifying_fields if field in data]
    return mmh3.hash128("#".join(present))
예제 #8
0
    def _get_bucket_idxes(self, element):
        idxes = []
        # Each i is a seed for a new Universal Hash Fn
        for i in range(self._k):
            idxes.append(mmh3.hash128(element, i) % len(self._bit_array))

        return idxes
예제 #9
0
async def vid(request: Request):
    """Start (or reuse) a download task for the submitted video URL.

    Reads the 'url' form field, normalizes it, and uses its murmur hash as
    the task key. A new ``Task`` and downloader process are created only when
    the key is not already in the cache.

    Returns:
        A redirect to "/" when no URL string was submitted, otherwise a
        redirect to the task page ``./task/<url_hash>``.
    """
    task_cache = GlobalCache()
    global wtf

    # Schedule the background fetcher once, on the first request.
    if not wtf:
        asyncio.get_event_loop().create_task(fetch_downloader())
        wtf = True

    vid_url = request.form.get('url')
    if not isinstance(vid_url, str):
        return redirect("/")

    # Keep only the part before the first '&'.
    vid_url = vid_url.split("&")[0]

    # Drop the scheme so http/https variants share one task. The previous
    # code used str.rstrip, which removes trailing *characters* from the set
    # "htps:/" and therefore truncated URLs ending in those characters while
    # leaving the scheme in place.
    # NOTE(review): download_vid now receives a scheme-less URL, as the
    # original code intended; confirm the downloader accepts that.
    for scheme in ("https://", "http://"):
        if vid_url.startswith(scheme):
            vid_url = vid_url[len(scheme):]
            break

    url_hash = hash128(vid_url)

    if url_hash not in task_cache:
        task_cache[url_hash] = Task()

        downloader_p = Process(target=download_vid,
                               args=(vid_url, download_q, url_hash))
        downloader_p.start()

    return redirect(f"./task/{url_hash}")
예제 #10
0
    def generate_filter(self, items, seeds=None, m=None):
        """
        Given a number of items, generate a bloom filter.

        Args:
            items(list, dict): Items to insert. A dict is inserted as one
                "key:value" string per entry.
            seeds(list): (Optional) A list of seed values for hashing
                algorithm. Defaults to ``self.seed_list``.
            m(int): (Optional) Size of bloom filter array. Defaults to
                ``self.m``.

        Returns:
            bytearray: An array of ``m`` bytes, each 0 or 1, representing the
                bloom filter.

        """
        if isinstance(items, dict):
            items = [str(key) + ":" + str(value)
                     for key, value in items.items()]
        if seeds is None:
            seeds = self.seed_list
        if m is None:
            m = self.m
        # Size the array from the effective m. It used to be sized from
        # self.m even when a different m was passed, which raised IndexError
        # for a larger m and returned an over-long filter for a smaller one.
        size = int(m)
        bloom_filter = bytearray(size)
        for item in items:
            encoded = str(item).encode()
            for seed in seeds:
                bloom_filter[mmh3.hash128(encoded, seed) % size] = 1
        return bloom_filter
예제 #11
0
    def _k_smallest_hash(self, document):
        """ Generates a texts minhash signature using k smallest neighbours method.

        Uses a single random hash to simulate a shuffle of each texts shingles.
        Then selecting i smallest minimum hash values for j permutations.

        Faster but less stable than multi hash method.

        Args:
            document (list): List of text shingles.

        Returns:
            list: List of text signatures generated using k smallest neighbours method.

        """
        signature = []
        # Uses a heap to make calculating n smallest values more efficient.
        heapq.heapify(signature)
        if len(document) <= self.permutations:
            raise ValueError(
                'N permutations must not be >= n shingles for k_smallest_values method'
            )
        for shingle in document:
            if self.hash_bits == 64:
                hashed_shingle = mmh3.hash64(shingle, self._hash_seeds)[0]
            elif self.hash_bits == 32:
                hashed_shingle = mmh3.hash(shingle, self._hash_seeds)
            else:
                hashed_shingle = mmh3.hash128(shingle, self._hash_seeds)
            heapq.heappush(signature, hashed_shingle)
        return heapq.nsmallest(self.permutations, signature)
예제 #12
0
    def __init__(self):
        """Register the supported hash and checksum functions by name.

        Each entry maps a name to a one-argument callable returning the
        digest as a string. The callables are lambdas so the backing modules
        are only looked up when a hash is actually requested.
        """
        self._hashes = {
            'md2': lambda x: self._get_md2_hash(x),
            'md4': lambda x: self._get_hashlib_hash('md4', x),
            'md5': lambda x: hashlib.md5(x).hexdigest(),
            'sha': lambda x: self._get_hashlib_hash('sha', x),
            'sha1': lambda x: hashlib.sha1(x).hexdigest(),
            'sha256': lambda x: hashlib.sha256(x).hexdigest(),
            'sha224': lambda x: hashlib.sha224(x).hexdigest(),
            'sha384': lambda x: hashlib.sha384(x).hexdigest(),
            'sha512': lambda x: hashlib.sha512(x).hexdigest(),
            'sha3_224': lambda x: sha3.sha3_224(x).hexdigest(),
            'sha3_256': lambda x: sha3.sha3_256(x).hexdigest(),
            'sha3_384': lambda x: sha3.sha3_384(x).hexdigest(),
            'sha3_512': lambda x: sha3.sha3_512(x).hexdigest(),
            'mmh2': lambda x: str(mmhash.get_hash(x)),
            'mmh2_unsigned': lambda x: str(mmhash.get_unsigned_hash(x)),
            'mmh3_32': lambda x: str(mmh3.hash(x)),
            # hash64 returns a pair; each half is exposed under its own name.
            'mmh3_64_1': lambda x: str(mmh3.hash64(x)[0]),
            'mmh3_64_2': lambda x: str(mmh3.hash64(x)[1]),
            'mmh3_128': lambda x: str(mmh3.hash128(x)),
            'ripemd160': lambda x: self._get_hashlib_hash('ripemd160', x),
            'whirlpool': lambda x: self._get_hashlib_hash('whirlpool', x),
            'blake2b': lambda x: pyblake2.blake2b(x).hexdigest(),
            'blake2s': lambda x: pyblake2.blake2s(x).hexdigest(),
            'crc32': lambda x: str(zlib.crc32(x)),
            'adler32': lambda x: str(zlib.adler32(x)),
        }
        self.hashes_and_checksums = self._hashes.keys()
        self.supported_hashes = HASHES
예제 #13
0
    def _multi_hash(self, document):
        """ Generates a texts minhash signature using multi-hash method.

        Uses i random hashes for j permutations selecting the minimum hash value
        each time to build each texts hash signature.

        Slower but more stable than k smallest hash method.

        Args:
            document (list): List of document shingles.

        Returns:
            list: List of text signatures generated using k smallest neighbours method.

        """
        signature = []
        for seed in np.nditer(self._hash_seeds):
            self._min_value = None
            for shingle in document:
                if self.hash_bits == 64:
                    hash_value = mmh3.hash64(shingle, int(seed))[0]
                elif self.hash_bits == 32:
                    hash_value = mmh3.hash(shingle, int(seed))
                else:
                    hash_value = mmh3.hash128(shingle, int(seed))
                if not self._min_value:
                    self._min_value = hash_value
                elif self._min_value > hash_value:
                    self._min_value = hash_value
            signature.append(self._min_value)
        return signature
예제 #14
0
파일: uptime.py 프로젝트: nburfield/uptime
def validate():
    """Fetch every URL in saved.json and report pages whose hash changed.

    ``saved.json`` maps each URL to its expected signed mmh3.hash128 (seed
    42) of the base64-encoded response body. ``processed.json``, when
    present, maps URLs to a flag meaning a hash failure was already reported;
    flagged URLs are still fetched but not reported for a hash mismatch
    again. Fetch errors are reported on every run. When anything was
    reported, ``email_error`` is called with the message and the updated
    flags.
    """
    already_sent = None
    if os.path.exists('processed.json'):
        # Context manager so the handle is closed even if the JSON is bad.
        with open('processed.json') as sent_json_file:
            already_sent = json.load(sent_json_file)
    if not already_sent:
        already_sent = {}

    with open('saved.json') as json_file:
        data = json.load(json_file)

    failures = []
    for p in data:
        try:
            with urllib.request.urlopen(p) as response:
                html = response.read()
            encoded = base64.b64encode(html)
            hashed = mmh3.hash128(encoded, 42, signed=True)

            reported_before = p in already_sent and already_sent[p]
            if not reported_before and data[p] != hashed:
                failures.append("- Failed Hash for: " + str(p) + '\r\n')
                already_sent[p] = True
        except Exception:
            # Best effort: any failure for this URL is reported and the scan
            # continues. Exception (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            failures.append("- Failed EXCEPTION for: " + str(p) + '\r\n')

    email_message = ''.join(failures)
    if email_message != '':
        email_error(email_message, already_sent)
예제 #15
0
def hash_to_bucket(e, B):
    """Map *e* to a bucket number between 1 and *B*.

    The 128-bit murmur hash of ``str(e)`` is scaled by 2**128 and the first
    bucket j (1-based) with ``(j-1)/B <= p < j/B`` is returned; *B* is
    returned when no interval matches.
    """
    fraction = mmh3.hash128(str(e)) / float(2**128)
    width = float(B)
    matches = (
        j + 1
        for j in range(B)
        if j / width <= fraction < (j + 1) / width
    )
    return next(matches, B)
예제 #16
0
파일: duffel.py 프로젝트: nellore/duffel
def forward(resource, identifier):
    """ Log the request, then redirect it to the matching IDIES data URL.

        resource: a given resource; only 'recount' leads to a redirect
        identifier: relative path to file or directory, appended to
            'http://idies.jhu.edu/recount/data'

        Return value: a 302 redirect response when a HEAD request to the
        target URL answers 200; otherwise abort(404) is called.

        NOTE(review): the previous docstring required a global "paths"
        dictionary; nothing in this function reads one.
    """
    # Log all requests, even weird ones
    # Client address: first entry of X-Forwarded-For, else the remote address.
    ip = str(request.headers.get('X-Forwarded-For',
                        request.remote_addr)).split(',')[0].strip()
    # Python 2 print-to-stream statement. The address is logged only as a
    # salted murmur hash, together with a timestamp and the request parts.
    print >>_LOGSTREAM, '\t'.join(
        [time.strftime('%A, %b %d, %Y at %I:%M:%S %p %Z'),
             str(mmh3.hash128(ip + 'recountsalt')),
             resource,
             identifier])
    _LOGSTREAM.flush()
    if resource == 'recount':
        # Redirect to IDIES URL in order of descending version
        for i in ['2']: # add versions to precede 2 as they are released
            # NOTE(review): ' '.join yields the prefix 'v 2 /' (with spaces);
            # ''.join ('v2/') was probably intended. The URL built in this
            # branch is identical to the fallback below either way.
            if identifier.startswith(' '.join(['v', i, '/'])):
                idies_url = '/'.join(
                            ['http://idies.jhu.edu/recount/data', identifier]
                        )
                idies_response = requests.head(idies_url)
                if idies_response.status_code == 200:
                    return redirect(idies_url, code=302)
        # v1 is not explicitly versioned
        idies_url = '/'.join(['http://idies.jhu.edu/recount/data', identifier])
        # Redirect only when the target answers the HEAD request with 200.
        idies_response = requests.head(idies_url)
        if idies_response.status_code == 200:
            return redirect(idies_url, code=302)
    # Unknown resource, or the target did not answer 200.
    abort(404)
예제 #17
0
 def _read(self, file_path: str) -> Iterable[Instance]:
     """Yield one instance per scored sentence of every article summary.

     Every ``*.json`` file directly inside *file_path* is loaded as a list of
     article summaries. For each article the scored sentences are looked up
     in ``../scored_sentences/<hex murmur hash of url>.json`` relative to
     *file_path*; articles whose file is missing or unreadable are skipped.
     Reading stops once more than ``self.max_files`` sentence files were
     found (when ``self.max_files`` is truthy).
     """
     root = Path(file_path)
     found = 0
     missing = 0
     json_files = (entry for entry in root.iterdir()
                   if entry.suffix == '.json')
     for json_path in json_files:
         with open(json_path) as handle:
             articles = json.load(handle)
         for article in articles:
             url = article['url']
             # Strip parenthesised spans (and one adjacent whitespace char
             # on each side) from the summary.
             article['summary'] = re.subn(
                 r"(\s?\([^)]*\)\s?)", "", article['summary'])[0]
             try:
                 sentences_path = root / "../scored_sentences/{:x}.json".format(
                     mmh3.hash128(url))
             except UnicodeError:
                 continue
             if not sentences_path.exists():
                 missing += 1
                 print(found, missing)
             else:
                 try:
                     with open(sentences_path) as sentence_file:
                         sentences = json.load(sentence_file)
                 except (ValueError, IOError):
                     continue
                 found += 1
                 for sentence, label in sentences['sentences']:
                     yield self.text_to_instance(sentence, label)
             if self.max_files and found > self.max_files:
                 return
예제 #18
0
 def murmur3_128bit(obj):
     """
     Hash a string with 128-bit murmur3, seeded with MURMUR_SEED.

     Select this hasher by passing: hasher=DeepHash.murmur3_128bit
     This hasher is the default hasher.
     """
     encoded = obj.encode('utf-8')
     return mmh3.hash128(encoded, MURMUR_SEED)
예제 #19
0
 def add(self, item):
     """Insert *item* into the filter.

     Returns False without changing anything when ``self.isContain(item)``
     is already true; otherwise sets one bit per hash seed and returns True.
     """
     if self.isContain(item):
         return False
     for seed in range(self.numHash):
         position = long(mmh3.hash128(item, seed) % self.size)
         self.bitArray[position] = 1
     return True
def simple_object(key, value):
    """Create a simple key/value object.

    The result carries the murmur hash of *value* as "_id", *key* as
    "_type", the module-level TOOL as "_tool", and *value* stored under
    *key* itself (assigned last, so it wins on a name clash).
    """
    obj = {}
    obj["_id"] = mmh3.hash128(value)
    obj["_type"] = key
    obj["_tool"] = TOOL
    obj[key] = value
    return obj
예제 #21
0
def get_file_hashes(file_path: Path) -> Iterator[int]:
    """Yield the murmur hash of every article URL found in a directory.

    Each ``*.json`` file directly inside *file_path* is loaded as a list of
    article summaries, and the 'url' of each summary is hashed.
    """
    for entry in file_path.iterdir():
        if entry.suffix != '.json':
            continue
        with open(entry) as jfile:
            articles = json.load(jfile)
        for article_summary in articles:
            yield mmh3.hash128(article_summary['url'])
예제 #22
0
 def murmur3_128bit(obj):
     """
     Hash a string with 128-bit murmur3, seeded with MURMUR_SEED.

     Select this hasher by passing: hasher=DeepHash.murmur3_128bit
     This hasher is the default hasher.
     """
     utf8_bytes = obj.encode('utf-8')
     return mmh3.hash128(utf8_bytes, MURMUR_SEED)
예제 #23
0
def test_64bit():
    """Check all four mmh3 entry points on a 2**32-byte zero buffer.

    The body only runs when ``sys.maxsize`` is at least 2**32.
    """
    if sys.maxsize >= (1 << 32):  # Skip this test under 32-bit environments
        zeros = np.zeros(2**32, dtype=np.int8)
        expected_pair = (-6319308327427928234, -8156928649350215884)
        expected_bytes = b'V\x8f}\xad\x8eNM\xa84\x07FU\x9c\xc4\xcc\x8e'
        assert mmh3.hash(zeros) == -1988950868
        assert mmh3.hash64(zeros) == expected_pair
        assert mmh3.hash128(zeros) == 189813591698865711411311444615608766294
        assert mmh3.hash_bytes(zeros) == expected_bytes
예제 #24
0
def flajolet_martin_algo(k, v, accum, seed, n):
    """Add 2**(trailing zero bits of the hash of *v*) to *accum*.

    *v* is lower-cased, hashed with mmh3.hash128 under *seed*, and reduced
    modulo *n*; the trailing '0' characters of its binary form are counted.
    *k* must be one of 'MM', 'OH', 'SIGH', 'UM' (ValueError otherwise).
    """
    categories = ['MM', 'OH', 'SIGH', 'UM']
    # Only the lookup's ValueError for an unknown k matters; the position
    # itself is not used.
    categories.index(k)
    bits = format(mmh3.hash128(v.lower(), seed) % n, 'b')
    trailing_zeros = len(bits) - len(bits.rstrip('0'))
    accum.add(pow(2, trailing_zeros))
    return
예제 #25
0
def run_plugin(data: dict) -> List[dict or None]:
    """Run testssl against ``data['domain']:data['port']`` and collect the
    reported vulnerabilities.

    The command's JSON result is parsed, and for every vulnerability of
    every scanned host a pair ``[ip, vulnerability]`` of dicts is produced.
    Each dict has a '_type' and an '_id'; the vulnerability's '_id' is the
    murmur hash of the ip's '_id' joined by '#' with the vulnerability's
    own calculated hash.

    Returns:
        A list of ``[ip, vulnerability]`` pairs.
    """
    port = data['port']
    domain = data['domain']

    log.info(f"Starting DNS information gathering for domain {domain}")

    output_result = "/tmp/result.json"
    # The executable is named differently on macOS.
    binary = 'testssl.sh' if platform.system() == "Darwin" else 'testssl'

    command = (f"{binary} --jsonfile-pretty={output_result} "
               f"--severity MEDIUM --sneaky -U -S -p "
               f"{domain}:{port}")

    execution_result = launch_command(command,
                                      callback=(print, log.info),
                                      file_result=output_result)
    scan = json.loads(execution_result)

    results = []
    for host in scan['scanResult']:
        for vulnerability in host['vulnerabilities']:
            # A fresh ip dict per finding, so result rows never share state.
            ip = {'_type': 'ip', 'ip': host['ip']}
            ip['_id'] = calculate_hash(ip)

            v = {
                '_type': 'vulnerability',
                'cve': vulnerability.get('cve', ""),
                'title': vulnerability['id'],
                'description': vulnerability.get('finding', ""),
                'cwe': vulnerability.get('cwe', "")
            }
            # The id ties the finding to its host: same finding on another
            # ip gets a different id.
            v['_id'] = mmh3.hash128(f"{ip['_id']}#{calculate_hash(v)}")

            results.append([ip, v])

    return results
예제 #26
0
    def api(self, reqtype, endpoint, data=None, headers=None, ttl=180, error_msg=None):
        '''(CanvasLMSTool, str, dict or str, dict, int (number of seconds to live), str) -> json

        Call the Canvas API at endpoint and return the decoded JSON response.

        reqtype is one of 'get', 'post', 'put', 'delete' and selects the
        function of the same name on the requests module. The current
        user's access token is appended to endpoint as a query parameter.
        Only 'get' calls are cached: the response is looked up in MC before
        the request and stored for ttl seconds afterwards.

        Raise an Exception with error_msg text if anything inside the request
        block fails. A 401 or 403 response deletes all cookies and raises a
        redirect to LOGOUT_URL (see the review note on the handler below).'''

        endpoint = str(endpoint)
        # Argument validation. NOTE(review): asserts are skipped under -O.
        assert reqtype in ['get', 'post', 'put', 'delete']
        assert isinstance(endpoint, str) and endpoint.startswith('/')
        assert data is None or isinstance(data, dict) or isinstance(data, str)
        assert isinstance(headers, dict) or headers is None
        assert error_msg is None or isinstance(error_msg, str)

        token = self.get_canvas_user()['token']

        # Default message is built before the token is appended to endpoint.
        error_msg = 'Failed to access Canvas. Location: ' + endpoint if error_msg is None else error_msg 

        # Append the token with the separator that matches the endpoint.
        if '?' in endpoint:
            endpoint += '&access_token=' + token
        else:
            endpoint += '?access_token=' + token

        if reqtype == 'get':
            # Cache key covers the endpoint (token included), data and headers.
            key = str('CanvasAPICall_' + str(hash128(endpoint + str(data) + str(headers))))

            try:
                r = MC.get(key)
                if r is not None:
                    return json.loads(r)
            except:
                # Cache trouble is logged and the request proceeds uncached.
                cherrypy.log('error accessing memcache')

            # NOTE(review): endpoint contains the access token at this point,
            # so the token ends up in the log.
            cherrypy.log('Request for ' + endpoint + ' not cached. Key: ' + key)

        req = getattr(requests, reqtype)

        try:
            content = ''
            # verify=False turns off TLS certificate verification in requests.
            r = req(self.canvas_url + endpoint, data=data, headers=headers, verify=False)
            if r.status_code in [401, 403]:
                delete_all_cookies()
                raise cherrypy.HTTPRedirect(LOGOUT_URL)
            if r.status_code != 200:
                # NOTE(review): a non-200 body is only stored here; nothing
                # is raised, so execution continues to r.json() below.
                content = r.content
        except:
            # NOTE(review): this bare except also catches the HTTPRedirect
            # raised just above and replaces it with a plain Exception. If
            # req() itself raised, r is not the response object here (it is
            # unbound, or the earlier cache lookup result), so
            # r.status_code fails as well.
            raise Exception(error_msg + ' ' + str(r.status_code) + ' ' + str(content))

        j = r.json()

        if reqtype == 'get':
            try:
                cherrypy.log('setting ' + key + ' :' + str(j))
                MC.set(key, json.dumps(j), ttl)
                # Python 2 print statement; echoes the value just cached.
                print MC.get(key)
            except:
                # Best effort: failing to populate the cache is ignored.
                pass
        
        return j
예제 #27
0
def hash_all_func(data):
    """Return three hash values for *data*.

    A str is ASCII-encoded first. The result is a tuple of: the first value
    returned by ``hashlittle2(data, 0, 0)``; bits 16..63 of the 128-bit
    murmur hash; and the low 64 bits of the murmur hash converted through
    np.uint64 to np.int64 and then to a Python int.
    """
    if isinstance(data, str):
        data = data.encode('ascii')

    little_c, _ = hashlittle2(data, 0, 0)
    murmur = mmh3.hash128(key=data, x64arch=True)

    middle_48_bits = (murmur >> 16) & 0x0000FFFFFFFFFFFF
    low_64_bits = murmur & 0xFFFFFFFFFFFFFFFF
    return little_c, middle_48_bits, int(np.int64(np.uint64(low_64_bits)))
예제 #28
0
def test_hashex_murmur():
    """hashex with each murmur3 method must match the direct mmh3 call
    (the first half of the pair for the 64-bit variant)."""
    hashex = proxenos.rendezvous.hashex
    method = proxenos.rendezvous.HashMethod
    key = 'secret'

    assert hashex(method.MMH3_32, key) == mmh3.hash(key)
    assert hashex(method.MMH3_64, key) == mmh3.hash64(key)[0]
    assert hashex(method.MMH3_128, key) == mmh3.hash128(key)
예제 #29
0
파일: dalvik.py 프로젝트: themoep/elsim
    def __init__(self, basic_block, sim):
        """Build the text signature of a basic block and hash it.

        For every instruction of ``basic_block.bb`` the cleaned instruction
        name and its static operands are appended, in order. The UTF-8
        encoded result is kept in ``self.buff`` and its 128-bit murmur hash
        in ``self.hash``. *sim* is accepted but not used here.
        """
        self.basic_block = basic_block
        pieces = []
        for instruction in self.basic_block.bb.get_instructions():
            pieces.append(dvm.clean_name_instruction(instruction))
            pieces.append(dvm.static_operand_instruction(instruction))

        self.buff = "".join(pieces).encode('UTF-8')
        self.hash = mmh3.hash128(self.buff)
예제 #30
0
def murmur_hash():
    """Pretty-print one sample digest from each of mmh3.hash, mmh3.hash64
    and mmh3.hash128."""
    samples = (
        (mmh3.hash, 'google'),
        (mmh3.hash64, 'amazon'),
        (mmh3.hash128, 'HugeHard'),
    )
    for hasher, text in samples:
        pprint(hasher(text))
def keyword_object(_type, **kwargs):
    """Create an object with multiple keys and values.

    The keyword arguments are copied, then "_type", "_tool" (module-level
    TOOL) and "_id" are set. "_id" is the murmur hash of all key/value
    pairs joined by "|", with every literal "|" doubled.
    """
    def escape(text):
        return text.replace("|", "||")

    obj = dict(kwargs)
    obj["_type"] = _type
    obj["_tool"] = TOOL
    fingerprint = "|".join(
        escape(key) + "|" + escape(value) for key, value in kwargs.items())
    obj["_id"] = mmh3.hash128(fingerprint)
    return obj
예제 #32
0
파일: routers.py 프로젝트: abrookins/quest
def hash_to_bucket(user_id, num_buckets):
    """Consistently hash `user_id` into one of `num_buckets` buckets.

    The 128-bit murmur hash of ``str(user_id)`` is scaled by 2**128 and the
    1-based number of the first bucket whose interval contains it is
    returned; `num_buckets` is returned when no interval matches.

    Approach derived from: https://stats.stackexchange.com/questions/26344/how-to-uniformly-project-a-hash-to-a-fixed-number-of-buckets
    """
    fraction = mmh3.hash128(str(user_id)) / float(2**128)
    width = float(num_buckets)
    for bucket in range(num_buckets):
        lower = bucket / width
        upper = (bucket + 1) / width
        if lower <= fraction < upper:
            return bucket + 1
    return num_buckets
예제 #33
0
def getSketch(dnaStr, k, seedList):
    """Build a min-hash sketch of a DNA string.

    For every seed, each k-mer of *dnaStr* is passed through
    ``compareFwdRev`` and hashed with mmh3.hash128; the minimum hash is that
    seed's sketch entry.

    Args:
        dnaStr: The sequence to sketch.
        k: k-mer length.
        seedList: Hash seeds, one sketch entry per seed.

    Returns:
        pandas.Series: The sketch, indexed by '0', '1', ... in seed order.

    Raises:
        ValueError: From ``np.min`` when *dnaStr* is shorter than *k* and a
            seed therefore has no k-mers to hash.
    """
    colNames = [str(i) for i in range(len(seedList))]

    # A string of length n has n - k + 1 k-mers; the previous bound of
    # n - k skipped the final one.
    kmer_count = len(dnaStr) - k + 1

    sketch = []
    for seed in seedList:
        hashvals = [mmh3.hash128(compareFwdRev(dnaStr[i:i + k]), seed)
                    for i in range(kmer_count)]
        sketch.append(np.min(hashvals))

    return pd.Series(data=sketch, index=colNames)
예제 #34
0
 def check(self, item):
     '''
     Check for existence of an item in filter.

     Returns False as soon as one of the item's ``self.hash_count`` bit
     positions is unset (the item is not present); returns True when all
     are set (the item is probably present).
     '''
     positions = (mmh3.hash128(item, seed) % self.size
                  for seed in range(self.hash_count))
     return not any(self.bit_array[position] == False
                    for position in positions)
def FM(stream, r):  #r is the number of estimates needed
    """Return, for each of *r* salted hash functions, the largest number of
    trailing zero bits seen over the stream.

    Each hash function is mmh3.hash128 applied to ``str(x)`` followed by a
    random salt drawn once per call.
    """
    salts = [str(s) for s in np.random.randint(1 << 30, size=r)]
    # longest[i] is the max no. of trailing zeros for the ith hash fn.
    longest = [0] * r

    for x in stream:
        text = str(x)
        for i, salt in enumerate(salts):
            bits = bin(mmh3.hash128(text + salt))[2:]
            trailing = len(bits) - len(bits.rstrip('0'))
            if trailing > longest[i]:
                longest[i] = trailing
    return longest
예제 #36
0
def remap_items(filename, outfilename, feature_map, offer_field='offers', enumerate=False):
    """
    Remap the offer ids of a tab-separated record file to hashes or indices.

    The field layout is read from ``filename + '.meta'``. For every record,
    the numeric tokens of ``offer_field`` are replaced by either the murmur
    hash of "<counter_id>_<item>" or, with ``enumerate`` set, by the index at
    which that hash was first seen. Records without numeric offers are
    dropped. The hash -> (index, counter_id, item) mapping is written to
    ``feature_map``, with its header in ``feature_map + '.meta'``.

    :param filename: input records (tab separated)
    :param outfilename: where the remapped records are written
    :param feature_map: where the mapping is written
    :param offer_field: name of the field holding the offer ids
    :param enumerate: write first-seen indices instead of hashes
        (note: the name shadows the builtin inside this function)
    :return: None
    """
    # Context managers close the meta and input files, which were
    # previously left open.
    with open(filename + '.meta') as meta_file:
        meta = RecordMeta(meta_file.readline().strip().split())
    Record = make_record_cls(meta.fields())

    mapping = {}
    with open(outfilename, 'w') as outfile, open(filename) as infile:
        for line in infile:
            rec = Record(*line.strip().split('\t'))
            items = [offer for offer in getattr(rec, offer_field).split()
                     if offer.isdigit()]
            if not items:
                continue
            counter_id = rec.counter_id

            new_items = []
            for item in items:
                offer_hash = mmh3.hash128("%s_%s" % (counter_id, item))

                if offer_hash not in mapping:
                    # Indices are handed out in first-seen order.
                    index = len(mapping)
                    mapping[offer_hash] = (index, counter_id, item)
                else:
                    index = mapping[offer_hash][0]

                new_items.append(str(index) if enumerate else str(offer_hash))

            new_rec_data = {f: getattr(rec, f) for f in meta.fields()}
            new_rec_data[offer_field] = ' '.join(new_items)

            new_rec = '\t'.join(new_rec_data[field] for field in meta.fields())
            outfile.write("%s\n" % new_rec)

    with open(feature_map, 'w') as fmap:
        # items() works on both Python 2 and 3 (iteritems() is 2-only).
        for offer_hash, (i, counter_id, item) in mapping.items():
            fmap.write("%s\t%s\t%s\t%s\n" % (offer_hash, i, counter_id, item))

    with open(feature_map + '.meta', 'w') as fmap:
        fmap.write("offer_hash\tmap\tcounter_id\toffer_id\n")
예제 #37
0
    def return_design_matrix(self, decision_state, reward=None, weight=1, critic_model=False):
        """
        Turn a (state, decision) pair into a model input.

        For the 'lookup_table' model class the pair is returned unchanged.
        Otherwise a text line is built with a "state" namespace (the state
        features plus a tag derived from their murmur hash) and a "decision"
        namespace; a truthy reward is prepended together with the weight.
        For now all features are categorical. ``critic_model`` is unused.

        Returns:
            tuple: (model input, reward)
        """
        if self.model_class == 'lookup_table':
            return decision_state, reward

        state, decision_taken = decision_state
        state_features = " ".join(state)
        state_tag = str(mmh3.hash128("_".join(state)))
        line = (" |state " + state_features + " tag_" + state_tag
                + " |decision action_" + str(decision_taken) + '\n')

        # NOTE(review): a reward of 0 is falsy, so no label is emitted for
        # it - confirm that is intended.
        if reward:
            line = str(reward) + " " + str(weight) + line

        return line, reward
예제 #38
0
	def lookup(self, new_string):
		"""Return True when all four bloom positions of new_string are set.

		The positions come from mmh3.hash, mmh3.hash128 and the two halves
		of mmh3.hash64, each reduced modulo 1000000.
		"""
		first_half, second_half = mmh3.hash64(new_string)
		digests = (
			mmh3.hash(new_string),
			mmh3.hash128(new_string),
			first_half,
			second_half,
		)
		for digest in digests:
			if self.bloom_array[digest % 1000000] == 0:
				return False
		return True
예제 #39
0
파일: stutils.py 프로젝트: btorch/stalker
def genPrimaryKey64(data):
    """Return the low 64 bits of the murmur hash of *data* as lowercase hex
    (no prefix, no padding)."""
    low_64_bits = mmh3.hash128(data) & 0xFFFFFFFFFFFFFFFF
    return format(low_64_bits, 'x')
예제 #40
0
	def add(self, new_string):
		"""Set the four bloom positions of new_string.

		The positions come from mmh3.hash, mmh3.hash128 and the two halves
		of mmh3.hash64, each reduced modulo 1000000.
		"""
		first_half, second_half = mmh3.hash64(new_string)
		digests = (
			mmh3.hash(new_string),
			mmh3.hash128(new_string),
			first_half,
			second_half,
		)
		for digest in digests:
			self.bloom_array[digest % 1000000] = 1
예제 #41
0
def hash_string(s):
    """Build a cache-key style identifier for *s*.

    The identifier combines VERSION, the 128-bit murmur hash of *s*, and the
    hex digest of an HMAC object keyed with the SHA-512 hex digest of *s*.
    """
    murmur = mmh3.hash128(s)
    sha512_hex = hashlib.sha512(s).hexdigest()
    # NOTE(review): hmac.new is given a key only (no message, no digestmod);
    # recent Python 3 releases require digestmod - confirm target version.
    keyed_digest = hmac.new(sha512_hex).hexdigest()
    return "html2latex_{version}_{mmh3_hash}_{hmac_of_sha512_hash}".format(
        version=VERSION,
        mmh3_hash=murmur,
        hmac_of_sha512_hash=keyed_digest,
    )
예제 #42
0
파일: test_mmh3.py 프로젝트: veegee/mmh3
 def test_hash_128(self):
     """mmh3.hash128 of 'hello' must equal the known reference value."""
     expected = 121118445609844952839898260755277781762
     assert mmh3.hash128('hello') == expected
예제 #43
0
def hashFiles(files):
    """Return a dict mapping the murmur hash of each entry of *files* to the
    entry itself (a later entry replaces an earlier one with the same hash)."""
    return {mmh3.hash128(name): name for name in files}
예제 #44
0
 def obfuscateDecimal(self, blob):
     """Return the murmur hash of *blob* masked to its low 128 bits."""
     mask = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
     digest = mmh3.hash128(blob)
     return digest & mask
예제 #45
0
 def _hash(self, item):
     bloom = 0
     for salt in self.salts:
         bloom |= (1L << (mmh3.hash128(salt + str(item)) % self.m))
     return bloom
예제 #46
0
    def build_key(params):
        """Return the cache key for a parameter tuple.

        Every truthy value is rendered as "<name>=<value>", the name taken
        from ``PersonCounter.PARAM_KEYS`` at the value's position. The
        comma-joined pairs are hashed and prefixed with 'person-count__'.
        """
        pairs = []
        for position, value in enumerate(params):
            if value:
                pairs.append(
                    '%s=%s' % (PersonCounter.PARAM_KEYS[position], value))
        return 'person-count__%s' % mmh3.hash128(','.join(pairs))
예제 #47
0
 def sparsify(self, x):
     """Render the non-zero positions of *x* as a "state" namespace string.

     Each index returned by ``np.nonzero(x)[0]`` becomes a 'pix_<index>'
     feature; a tag built from the murmur hash of the '_'-joined features
     is appended.
     """
     features = ['pix_' + str(i) for i in np.nonzero(x)[0]]
     tag = str(mmh3.hash128("_".join(features)))
     return " |state " + " ".join(features) + " tag_" + tag