Example #1
def ft_process(text):
    # split the input into lines
    lines = text.split('\n')
    # max_0 holds the running maximum trailing-zero count for each hash/seed slot
    global max_0
    for line in lines:
        # skip blank lines
        if len(line) == 0:
            continue
        # only lines beginning with 'Q' carry a quote
        if line[0] == "Q":
            quote = line[2:]

            for i in range(10):
                # count the trailing 0's of the super_fast_hash value
                hash_value = pyhash.super_fast_hash(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # if the bit string contains no 1's, treat it as having no trailing 0's
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0
            for i in range(10, 20):
                # count the trailing 0's of the murmur3_32 value
                hash_value = pyhash.murmur3_32(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # if the bit string contains no 1's, treat it as having no trailing 0's
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0
            for i in range(20, 30):
                # count the trailing 0's of the xx_32 value
                hash_value = pyhash.xx_32(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # if the bit string contains no 1's, treat it as having no trailing 0's
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0

    return
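The per-seed maxima collected in max_0 are the raw material for a Flajolet-Martin style distinct-count estimate. A minimal sketch of how a caller might combine them, assuming max_0 is the 30-element list filled by ft_process above; the grouping/median scheme and the 0.77351 correction constant are my assumptions, not part of the original code:

def estimate_distinct(max_0):
    # Hypothetical follow-up: turn the maximum trailing-zero counts into a
    # Flajolet-Martin style estimate of the number of distinct quotes.
    phi = 0.77351  # standard Flajolet-Martin correction constant
    groups = [max_0[0:10], max_0[10:20], max_0[20:30]]  # one group per hash family
    # average the exponent within each group, then take the median across groups
    estimates = sorted((2 ** (sum(g) / len(g))) / phi for g in groups)
    return estimates[1]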
Example #2
    def __init__(self, conf):
        self.hashes = conf["hashes"]
        self.b = conf["b"]
        self.r = conf["r"]
        self.band_seed = conf["band_seed"]
        self.seeds = conf["seeds"]

        self.hasher = pyhash.murmur3_32()
Example #3
    def __init__(self, conf):
        self.hashes = conf['hashes']
        self.b = conf['b']
        self.r = conf['r']
        self.band_seed = conf['band_seed']
        self.seeds = conf['seeds']

        self.hasher = pyhash.murmur3_32()
Example #4
    def __init__(self, conf):
        self.hashes = conf['hashes']
        self.b = conf['b']
        self.r = conf['r']
        self.band_seed = conf['band_seed']
        self.seeds = conf['seeds']

        self.hasher = pyhash.murmur3_32()
Example #5
def _run_minhash(A, B, data, seed, num_perm, b):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    for i in xrange(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in xrange(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
Example #6
def _run_minhash(A, B, data, seed, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return jaccard([m1, m2])
Example #7
def _run_hyperloglog(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    for i in xrange(a_start, a_end):
        h1.update(hasher(data[i], seed=seed))
    for i in xrange(b_start, b_end):
        h2.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
Example #8
def _run_minhash(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=2**p)
    m2 = MinHash(num_perm=2**p)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return _minhash_inclusion(m1, m2)
Example #9
def _run_hyperloglog(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    for i in xrange(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return _hyperloglog_inclusion(h1, h2)
Example #10
def _run_minhash(A, B, data, seed, bs, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2)] + \
            [_b_bit_minhash_jaccard(m1, m2, b) for b in bs]
Example #11
 def __init__(self, filter_size):
     '''
     First, we initialize the Bloom filter and create a bit array of filter_size entered by the user.
     In addition, we initialize each of the hash functions (Murmur3 and FNV-1) of our filter.

     filter_size: size of the vector
     '''
     self.filter = filter_size
     self.bit_array = bitarray(filter_size)
     self.bit_array.setall(0)  # bitarray doesn't guarantee that all bits start at 0
     self.hasher1 = pyhash.fnv1_32()
     self.hasher2 = pyhash.murmur3_32()
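Only the constructor is shown; a usable Bloom filter also needs an insert and a lookup that set and test the same two hash positions. A minimal sketch of those two methods, reusing the attribute names above; the method names add and check are hypothetical, not taken from the original class:

 def add(self, item):
     # hypothetical insert: set the bit chosen by each hash function
     self.bit_array[self.hasher1(item) % self.filter] = 1
     self.bit_array[self.hasher2(item) % self.filter] = 1

 def check(self, item):
     # hypothetical lookup: the item may be present only if every probed bit is 1;
     # a 0 anywhere means it was definitely never added
     return (self.bit_array[self.hasher1(item) % self.filter] == 1
             and self.bit_array[self.hasher2(item) % self.filter] == 1)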
Example #12
 def __init__(self,
              set_data_path: str,
              hash_mod: int = 10000,
              number_of_hashes: int = 20):
     self.set_data_path = set_data_path
     self.hasher = pyhash.murmur3_32()
     self.bucket_numbers = {}
     if os.path.exists(SETS_PATH % self.set_data_path):
         with open(SETS_PATH % self.set_data_path, "rb") as sets_file:
             self.hashing_sets, self.bucket_numbers = pickle.load(sets_file)
     else:
         self.hashing_sets = [set() for _ in range(number_of_hashes)]
     self.hash_mod = hash_mod
     self.contains_lru = deque(maxlen=CONTAINS_CACHE_SIZE)
     self.add_lru = deque(maxlen=MAX_UNCOMMITED)
     self.writeahead_log = self._writeahead_init()
     self.log_uncommited = []
Example #13
 def __init__(self,
              set_data_path: str,
              hash_mod: int = 10000,
              number_of_hashes: int = 20,
              commit_number: Optional[int] = None,
              recover_state_on_init: bool = False):
     super().__init__()
     self.set_data_path = set_data_path
     self.hash_mod = hash_mod
     self.number_of_hashes = number_of_hashes
     self.contains_lru = deque(maxlen=CONTAINS_CACHE_SIZE)
     self.add_lru = deque(maxlen=ADD_LRU)
     self.writeahead_log = None
     self.hasher = pyhash.murmur3_32()
     self.prepare_buffer = []
     self.hashing_sets = []
     self.bucket_numbers = {}
     # This allows an invalid state in the object, sorry fontela :(
     if recover_state_on_init:
         self.recover_state(commit_number)
Example #14
 def __init__(self):
     self.DIVISOR = 20
     self.big_vector = [0] * self.DIVISOR
     self.fnv = fnv1_32()
     self.murmur = murmur3_32()
Example #15
File: lsh.py Project: faisal-w/pLSH-HDC
 def __init__(self, dim, threshold):
     self.dim = dim
     self.threshold = threshold
     self.bandwidth = self.get_bandwidth(dim, threshold)
     self.hasher = pyhash.murmur3_32()
Example #16
File: lsh.py Project: faisal-w/pLSH-HDC
 def __init__(self, dim, seeds=None):
     self.dim = dim
     self.seeds = self._set_seeds(seeds)
     self.hasher = pyhash.murmur3_32()
     self.hashes = self._hash_functions()
Example #17
 def __init__(self, path):
     self.path = path
     self.num_scanned = 0
     Thread.__init__(self)
     self.hasher = pyhash.murmur3_32()
Example #18
import pyhash

bit_vector = [0] * 20

fnv = pyhash.fnv1_32()
murmur = pyhash.murmur3_32()
Example #19
import string
from itertools import product

NONE, PYHASH, PURE_PYTHON_HASH, CACHED = 0, 1, 2, 3
hash_mode = PURE_PYTHON_HASH 

if hash_mode == PYHASH:
    import pyhash
    hasher = pyhash.murmur3_32()
    def hash_function(seq):
        return hasher(seq) % 1000000
elif hash_mode == PURE_PYTHON_HASH:
    def hash_integer(elem):
        # RanHash
        v = elem * 3935559000370003845 + 2691343689449507681;
        v = v ^ (v >> 21);
        v = v ^ (v << 37);
        v = v ^ (v >>  4);
        v = v * 4768777513237032717;
        v = v ^ (v << 20);
        v = v ^ (v >> 41);
        v = v ^ (v <<  5);
        return v
    def hash_function(seq):
        return hash_integer(hash(seq)) % 1000000

hash_dict = None
def precompute_hashes(m):
    global hash_dict, hash_mode
    if hash_mode == NONE:
        return
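In either of the two modes shown, hash_function exposes the same interface: any hashable value in, a bucket index in [0, 1000000) out. A tiny usage sketch under that assumption (the example strings are arbitrary):

for seq in ("ACGTACGT", "hello world"):
    print(seq, "->", hash_function(seq))  # prints a bucket index in [0, 1000000)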
Example #20
# Section 2 - Bloom Implementation
"""

!pip install bitarray

from bitarray import bitarray

!pip install pyhash

import pyhash

from pyhash import murmur3_32 #importing non-cryptographic hash functions
from pyhash import fnv1_32

fnv_hasher = pyhash.fnv1_32()
murmur_hasher = pyhash.murmur3_32()

class Bloom():
    """Bloom Filter"""

    def __init__(self, filter_size):
        '''
        First, we initialize the Bloom filter and create a bit array of filter_size entered by the user.
        In addition, we initialize each of the hash functions (Murmur3 and FNV-1) of our filter.

        filter_size: size of the vector
        '''
        self.filter = filter_size
        self.bit_array = bitarray(filter_size)
        self.bit_array.setall(0)  # bitarray doesn't guarantee that all bits start at 0
Example #21
    def setUp(self):
        self.unigrams = set()
        self.bigrams = set()
        self.char_trigrams = set()

        self.cat_tokens = set()
        self.attr_tokens = set()

        self.hasher = pyhash.murmur3_32()
        self.unknown_bin = 16

        self.feature_precomputed = {
            "reviews": [0.0, 3437.0],
            "rating": [0.0, 100.0],
            "sales_monthly": [0.0, 14345.0],
            "sales_yearly": [0.0, 136592.0],
            "support_p2h_delivery": [0.0, 1.0]
        }
        self.header_fields = [
            "reviews", "rating", "sales_monthly", "sales_yearly",
            "support_p2h_delivery"
        ]

        p1 = {
            'product_name':
            'Ổ Cứng SSD Kingston HyperX FURY 120GB - SATA III - Hàng Chính Hãng',
            'brand': 'Kingston',
            'author': '',
            'attributes':
            '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
            'categories':
            '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
            'reviews': 100,
            'rating': 80,
            'sales_monthly': 20,
            'sales_yearly': 100,
            'support_p2h_delivery': 1
        }

        p2 = {
            'product_name':
            'Ổ Cứng SSD Sata III 2.5 Inch Samsung 850 EVO 120GB - Hàng Chính Hãng',
            'brand': 'Samsung',
            'author': '',
            'attributes':
            '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
            'categories':
            '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
            'reviews': 71,
            'rating': 95,
            'sales_monthly': 10,
            'sales_yearly': 50,
            'support_p2h_delivery': 1
        }

        p3 = {
            'product_name':
            'Mai Em Vào Lớp 1 - Vở Tập Tô Chữ (Dành Cho Trẻ 5 - 6 Tuổi) - Tập 1',
            'brand': '',
            'author': ['Lê Hồng Đăng - Lê Thị Ngọc Ánh'],
            'attributes': '',
            'categories':
            '8322#2#Nhà Sách Tiki|316#3#Sách tiếng Việt |393#4#Sách thiếu nhi |853#5#Kiến thức - Bách khoa',
            'reviews': 0,
            'rating': 0,
            'sales_monthly': 3,
            'sales_yearly': 10,
            'support_p2h_delivery': 1
        }

        self.products = [p1, p2, p3]
        self.queries = ['ổ cứng', 'samsung 850', 'vở tập tô']
        self.target = [2, 1, 0]  # positive, impressed, negative
        self.weights = [1., 1., 1.]

        for p in self.products:
            self.add_to_vocab(query_preprocessing(p['product_name']))

            for z in p['attributes'].split('|'):
                t = "#".join(z.split("#")[:2])
                self.attr_tokens.add(t)

            for z in p['categories'].split('|'):
                t = "#".join(z.split("#")[:2])
                self.cat_tokens.add(t)

        self.vocab = self.unigrams.copy()
        self.vocab = self.vocab.union(self.bigrams, self.char_trigrams)
        self.vocab = list(self.vocab)
        self.zero_idx = len(self.vocab) + self.unknown_bin

        self.token_2_idx = {}
        for i, t in enumerate(self.vocab):
            self.token_2_idx[t] = i

        self.cat_token_2_idx = {}
        for i, t in enumerate(self.cat_tokens):
            self.cat_token_2_idx[t] = i
        self.cat_zero_idx = len(self.cat_tokens)

        self.attr_token_2_idx = {}
        for i, t in enumerate(self.attr_tokens):
            self.attr_token_2_idx[t] = i
        self.attr_zero_idx = len(self.attr_tokens)

        self.embed_size = 80
        self.attr_cat_embed_size = 10
        self.vocab_size = len(self.token_2_idx)
        self.max_query_length = 25
        self.max_product_length = 50
        self.max_brand_length = 25
        self.max_author_length = 25
        self.max_attr_length = 10
        self.max_cat_length = 10
        self.filter_sizes = [2, 3, 4, 5]
        self.num_filters = 5
Example #22
def _run_hyperloglog(data, seed, p):
    hasher = pyhash.murmur3_32()
    h = HyperLogLog(p=p)
    for d in data:
        h.digest(Hash(hasher(d, seed=seed)))
    return h.count()
Example #23
File: lsh.py Project: englehardt/pLSH-HDC
 def __init__(self, dim, seeds=None):
     self.dim = dim
     self.seeds = self._set_seeds(seeds)
     self.hasher = pyhash.murmur3_32()
     self.hashes = self._hash_functions()
Example #24
    def __init__(
        self, pair_paths,
        precomputed_path,
        product_db,
        vocab_path,
        cat_tokens_path, 
        attr_tokens_path,
        maximums_query=[25, 25, 125],#for unigram, bigram, character trigrams
        maximums_product_name=[50, 50, 250], #for unigram, bigram, character trigrams
        maximums_brand=[10, 10, 50],
        maximums_author=[10, 10, 50],
        maximums_cat=[10, 10, 20], #for unigram, bigram, character trigrams
        maximums_attr=[10, 10, 20], #for unigram, bigram, character trigrams
        unknown_bin=8012):

        self.vocab = []
        with open(vocab_path, 'r') as fobj:
            for l in fobj:
                if len(l.strip()):
                    self.vocab.append(l.strip())
        self.cat_tokens = []
        with open(cat_tokens_path, 'r') as fobj:
            for l in fobj:
                if len(l.strip()):
                    self.cat_tokens.append(l.strip())
        self.attr_tokens = []
        with open(attr_tokens_path, 'r') as fobj:
            for l in fobj:
                if len(l.strip()):
                    self.attr_tokens.append(l.strip())

        with open(precomputed_path, 'r') as fobj:
            self.precomputed = json.load(fobj)

        self.vocab_size = len(self.vocab)
        self.cat_tokens_size = len(self.cat_tokens)
        self.attr_tokens_size = len(self.attr_tokens)

        self.unknown_bin = unknown_bin

        self.maximums_query = maximums_query
        self.maximums_product_name = maximums_product_name
        self.maximums_brand = maximums_brand
        self.maximums_author = maximums_author
        self.maximums_cat = maximums_cat
        self.maximums_attr = maximums_attr
        
        self.token_2_idx = {}
        self.cat_token_2_idx = {}
        self.attr_token_2_idx = {}
        
        self.zero_idx = len(self.vocab) + self.unknown_bin
        for i, w in enumerate(self.vocab):
            self.token_2_idx[w] = i
        
        self.cat_zero_idx = len(self.cat_tokens)
        for i, w in enumerate(self.cat_tokens):
            self.cat_token_2_idx[w] = i
        
        self.attr_zero_idx = len(self.attr_tokens)
        for i, w in enumerate(self.attr_tokens):
            self.attr_token_2_idx[w] = i

        self.hasher = pyhash.murmur3_32()

        # initialize sampling pools
        self.pair_paths = pair_paths
        self.precomputed_path = precomputed_path

        # self.conn = create_connection(product_db)
        # self.headers = get_fields(self.conn)

        if product_db:
            self.product_dict = {}
            with open(product_db, "r") as fobj:
                csv_reader= csv.DictReader(fobj)
                for i, r in enumerate(csv_reader):
                    r = dict(r)
                    r["name"] = query_preprocessing(r.get("name"))
                    r["brand"] = query_preprocessing(r.get("brand"))
                    r["author"] = query_preprocessing(r.get("author"))
                    self.product_dict[r.get("product_id")] = r
                    if i % 100000 == 0:
                        print("Loaded %d products" % i)

            self.product_ids =  list(self.product_dict.keys())
Example #25
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p)
    for d in data:
        m.digest(Hash(hasher(d, seed=seed)))
    return m.count()
Example #26
File: utils.py Project: zz202/cvat
# Copyright (C) 2019 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os.path as osp
from pyhash import murmur3_32

from datumaro.cli.util import make_file_name

hasher = murmur3_32()

def get_color_from_index(index):
    def get_bit(number, index):
        return (number >> index) & 1

    color = [0, 0, 0]

    for j in range(7, -1, -1):
        for c in range(3):
            color[c] |= get_bit(index, c) << j
        index >>= 3

    return tuple(color)

DEFAULT_COLORMAP_CAPACITY = 2000
DEFAULT_COLORMAP_PATH = osp.join(osp.dirname(__file__), 'predefined_colors.txt')
def parse_default_colors(file_path=None):
    if file_path is None:
        file_path = DEFAULT_COLORMAP_PATH

    colors = {}
Example #27
 def __init__(self, by_key: str, shards: int):
     self.by_key = by_key
     self.shards = shards
     self.hasher = pyhash.murmur3_32()
Example #28
import pyhash as ph

# Non cryptographic hash functions (Murmur and FNV)
fnv = ph.fnv1_32()
murmur = ph.murmur3_32()

# Calculate the output of the FNV and Murmur hash functions for Pikachu and Charmander

bit_vector = [0] * 20

fnv_pika = fnv("Pikachu") % 20
murmur_pika = murmur("Pikachu") % 20

fnv_char = fnv("Charmander") % 20
murmur_char = murmur("Charmander") % 20

print("fnv_pika\t", fnv_pika)
print("fnv_char\t", fnv_char)
print("murmur_pika\t", murmur_pika)
print("murmur_char\t", murmur_char)

bit_vector[fnv_char] = 1
bit_vector[fnv_pika] = 1

bit_vector[murmur_char] = 1
bit_vector[murmur_pika] = 1
print(bit_vector)

# If either of them (or both) is 0, the item is not in the Bloom filter
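The closing comment states the lookup rule; a small sketch of that check, with a query that was never inserted (the helper name and query strings are mine, for illustration):

def maybe_contains(name):
    # membership test following the comment above: every probed bit must be 1
    return bit_vector[fnv(name) % 20] == 1 and bit_vector[murmur(name) % 20] == 1

print(maybe_contains("Pikachu"))     # True - it was inserted above
print(maybe_contains("Bulbasaur"))   # usually False, though a collision can yield a false positive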
Example #29
    sys.stdout.write("{0}: {1}\n".format(prog, msg))

def echo(msg):
    sys.stdout.write(msg + "\n")

def int_as_bytearray(i):
    # Doing this avoids the leading 0x and trailing L (on longs) that
    # hex() adds.
    as_hex = "%x" % i

    # The ``if... else...`` bit is to ensure that we have an even
    # number of digits.
    return bytearray.fromhex(as_hex if not len(as_hex) % 2
                             else "0" + as_hex)

hasher = murmur3_32()
def get_hash(src):
    return base64.urlsafe_b64encode(int_as_bytearray(hasher(src))) \
                 .rstrip("=")

def get_src_path(working_dir):
    return os.path.join(working_dir, "src")

class Command(object):

    def __init__(self, args):
        """
        A command on the command line. This backup software takes a
        command as its first argument.
        """
        self.args = args
Example #30
def _run_hyperloglog(data, seed, p):
    hasher = pyhash.murmur3_32()
    h = HyperLogLog(p=p, hashobj=Hash)
    for d in data:
        h.update(hasher(d, seed=seed))
    return h.count()
Example #31
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p, hashobj=Hash)
    for d in data:
        m.update(hasher(d, seed=seed))
    return m.count()
Example #32
def bloomIPs(clientSize):
    """
    I will implement the IP-blocker example described above.
    For simplicity, let's assume our IP system is composed of values between
    0-100000.
    """
    #My bit vector
    bitVector = [0] * clientSize
    """
    We use 3 hash functions, Murmur, FNV and metro hash systems. They are 
    non cryptographic hence will return the same value any time we pass in
    the same value. We modularize them by our bit  Vector size to make them
    fit into it as it is our reference sheet.
    """
    fnv = pyhash.fnv1_32()
    murmur = pyhash.murmur3_32()
    metro = pyhash.metro_128()
    """
    Now lets imagine we identified a set of just 1000 hackers in our world.
    Their IPs range from 0 to 1000 as follows.
    """
    hackerSize = 1000
    hackersList = range(0, hackerSize)
    """
    To keep our random clients happy, we come up with a repo of all the 
    hackers Known. Our customers are very stubborn but they love being safe.
    Its a very dangerous world out there.
    We mark the hackers.
    """
    for hacker in hackersList:
        #Hash them with our 3 functions
        bitVector[fnv(str(hacker)) % clientSize] = 1
        bitVector[murmur(str(hacker)) % clientSize] = 1
        bitVector[metro(str(hacker)) % clientSize] = 1
    """
    Now our true clients make requests. We have say 100000 of them.
    We look them up in our list and determine if they are hackers or not
    An approved request is marked as Perfect. Lets count, of the 700,
    False Positives are clients Noted as Hackers
    How many will be marked perfect
    """
    perfect = 0
    falsePositive = 0
    for cust in range(0, 100000):
        trueClient = random.randrange(10000, 100000)
        check1 = bitVector[fnv(str(trueClient)) % clientSize]
        check2 = bitVector[murmur(str(trueClient)) % clientSize]
        check3 = bitVector[metro(str(trueClient)) % clientSize]
        #print("{}-{}-{}").format(check1,check2,check3)
        """
        We will not grant perfection to them if they are detected as hackers
        by any of our security systems, we mark the false positive.
        Othewise we just think of them as false negatives
        """
        checkFinale = (check1 == check2 == check3 == 1)
        if checkFinale is True:
            falsePositive += 1
        else:
            perfect += 1
    doc = """
    Running our check, we wil throw warnings to some true clients thinking
    they are hackers when they are in fact not. Run it again to see how
    many true clients connect. At least we know they are safe. But as seen.
    there is a possibility of our clients, whose IPs are not even in the
    same range as the hackers to be detected as hackers."""
    return {
        "hackerSize": hackerSize,
        "falsePositive": falsePositive,
        "clientSize": clientSize,
        "doc": doc,
        "perfect": perfect
    }
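A quick way to see the false-positive behaviour is to call the function with a reasonably large bit vector and print the counts; the size below is just an example value, and the sketch assumes the pyhash and random imports the function relies on are in scope:

if __name__ == "__main__":
    result = bloomIPs(100000)
    print(result["doc"])
    print("perfect: %d, false positives: %d out of %d clients"
          % (result["perfect"], result["falsePositive"], 100000))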
Example #33
#https://code.google.com/p/pyfasthash/

h_fnv1_32 = pyhash.fnv1_32()
def fnv1_32(req):
    return h_fnv1_32(str(req))


h_lookup3 = pyhash.lookup3_big()
def lookup3(req):
    return h_lookup3(str(req))

h_super_fast_hash = pyhash.super_fast_hash()
def super_fast_hash(req):
    return h_super_fast_hash(str(req))


h_murmur2_x64_64a = pyhash.murmur2_x64_64a()
def murmur2_x64_64a(req):
    return h_murmur2_x64_64a(str(req))


h_murmur3_32 = pyhash.murmur3_32()
def murmur3_32(req):
    return h_murmur3_32(str(req))

h_fnv1a_64 = pyhash.fnv1a_64()
def fnv1a_64(req):
    return h_fnv1a_64(str(req))
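Each wrapper stringifies the request and feeds it to one pre-built hasher, so they are interchangeable bucket functions. A small comparison sketch, assuming import pyhash appears at the top of the module (the request string and modulus are arbitrary):

req = "GET /index.html"
for fn in (fnv1_32, lookup3, super_fast_hash, murmur2_x64_64a, murmur3_32, fnv1a_64):
    # each wrapper maps the same request to a bucket in [0, 1024)
    print(fn.__name__, fn(req) % 1024)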

    
Example #34
is not

E.g.: I run a website and want to keep track of IP
addresses that are blocked. I don't care if a blocked IP
is occasionally able to access my website, but I do care if someone not
on the blocked list is unable to access the site.

bit_vector is list of bits
"""
import pyhash

bit_vector = [0] * 20

# Non-cryptographic hash functions (Murmur and FNV)
fnv = pyhash.fnv1_32()
murmur = pyhash.murmur3_32()

#Calculate the output of FNV and Murmur hash functions for Pikachu and Charmander
fnv_pika = fnv("Pikachu") % 20
fnv_char = fnv("Charmander") % 20

murmur_pika = murmur("Pikachu") % 20
murmur_char = murmur("Charmander") % 20

bit_vector[fnv_pika] = 1
bit_vector[murmur_pika] = 1

bit_vector[fnv_char] = 1
bit_vector[murmur_char] = 1

#print(fnv_pika)
Example #35
def test_default_string_type():
    hasher = pyhash.murmur3_32()

    assert hasher('foo') == hasher(u'foo')
    assert hasher('foo') != hasher(b'foo')
Example #36
File: lsh.py Project: englehardt/pLSH-HDC
 def __init__(self, dim, threshold):
     self.dim = dim
     self.threshold = threshold
     self.bandwidth = self.get_bandwidth(dim, threshold)
     self.hasher = pyhash.murmur3_32()
Example #37
def worker(wid, queue, csv_queue, limit_sample, batch_size, precomputed_path,
           product_db, vocab_path, cat_tokens_path, attr_tokens_path,
           maximums_query, maximums_product_name, maximums_brand,
           maximums_author, maximums_cat, maximums_attr, unknown_bin):

    hasher = pyhash.murmur3_32()

    def unknown_to_idx():
        def _inside(unknown):
            return hasher(unknown) % unknown_bin

        return _inside

    meta_inst = MetaData(precomputed_path, product_db, vocab_path,
                         cat_tokens_path, attr_tokens_path, maximums_query,
                         maximums_product_name, maximums_brand,
                         maximums_author, maximums_cat, maximums_attr,
                         unknown_bin)
    product_ids = meta_inst.product_ids
    print("Data worker %d started" % wid)

    total_sample = 0
    while True:
        if queue.qsize() > 1000:
            time.sleep(0.1)
            continue

        for _ in range(64):
            queries = []
            labels = []
            products = []
            qids = []
            count_keyword = 0
            unique_queries = []
            count_qs = []
            count_t = 0

            for k in range(batch_size):
                r = csv_queue.get()
                keyword = r[0]
                r1 = r[1]
                if len(keyword) == 0:
                    continue
                count_keyword += 1
                pk = r1.split("|")
                pnk = [z.split("#") for z in pk]
                pos = []
                zero = []
                neg = []

                for p in pnk:
                    if p[1] == '2':
                        pos.append(p[0])
                    elif p[1] == '1':
                        zero.append(p[0])
                    else:
                        neg.append(p[0])
                n = min(1, len(pos))
                if n > 6:
                    n = 4
                    pos = random.sample(pos, n)
                if n == 0:
                    n = len(zero)
                    if n > 6:
                        n = 4
                        zero = random.sample(zero, n)
                    if n:
                        neg = random.sample(product_ids,
                                            n * 7) + random.sample(
                                                neg, min(len(neg), 8))
                        pass
                    else:
                        continue
                else:
                    zero = random.sample(zero, min(len(zero), n * 6))
                    neg = random.sample(product_ids, n * 7) + random.sample(
                        neg, min(len(neg), 8))

                count_q = 0
                for samples, l in zip([pos, zero, neg], [2, 1, 0]):
                    for s in samples:
                        product = meta_inst.get_product(s)
                        if product:
                            count_q += 1
                            queries.append(keyword)
                            qids.append(count_keyword)
                            products.append(product)
                            labels.append(l)
                if count_q:
                    unique_queries.append(keyword)
                    count_qs.append(count_q)

            query_unigram_indices = []
            query_bigram_indices = []
            query_char_trigram_indices = []

            for q, r in zip(unique_queries, count_qs):
                u, b, t =  \
                convert_strings(
                    [q], meta_inst.token_2_idx, meta_inst.zero_idx,
                    meta_inst.maximums_query[0], meta_inst.maximums_query[1], meta_inst.maximums_query[2],
                    unknown_to_idx())
                query_unigram_indices.append(np.tile(u, (r, 1)))
                query_bigram_indices.append(np.tile(b, (r, 1)))
                query_char_trigram_indices.append(np.tile(t, (r, 1)))
            query_unigram_indices = np.concatenate(query_unigram_indices,
                                                   axis=0)
            query_bigram_indices = np.concatenate(query_bigram_indices, axis=0)
            query_char_trigram_indices = np.concatenate(
                query_char_trigram_indices, axis=0)

            product_unigram_indices = []
            product_bigram_indices = []
            product_char_trigram_indices = []

            brand_unigram_indices = []
            brand_bigram_indices = []
            brand_char_trigram_indices = []

            author_unigram_indices = []
            author_bigram_indices = []
            author_char_trigram_indices = []

            cat_tokens = []
            cat_in_product = []
            cat_unigram_indices = []
            cat_bigram_indices = []
            cat_char_trigram_indices = []

            attr_tokens = []
            attr_in_product = []
            attr_unigram_indices = []
            attr_bigram_indices = []
            attr_char_trigram_indices = []

            features = []

            for p in products:
                product_unigram_indices.append(
                    np.frombuffer(p.get("product_unigram_indices"),
                                  dtype=np.int32))
                product_bigram_indices.append(
                    np.frombuffer(p.get("product_bigram_indices"),
                                  dtype=np.int32))
                product_char_trigram_indices.append(
                    np.frombuffer(p.get("product_char_trigram_indices"),
                                  dtype=np.int32))
                brand_unigram_indices.append(
                    np.frombuffer(p.get("brand_unigram_indices"),
                                  dtype=np.int32))
                brand_bigram_indices.append(
                    np.frombuffer(p.get("brand_bigram_indices"),
                                  dtype=np.int32))
                brand_char_trigram_indices.append(
                    np.frombuffer(p.get("brand_char_trigram_indices"),
                                  dtype=np.int32))
                author_unigram_indices.append(
                    np.frombuffer(p.get("author_unigram_indices"),
                                  dtype=np.int32))
                author_bigram_indices.append(
                    np.frombuffer(p.get("author_bigram_indices"),
                                  dtype=np.int32))
                author_char_trigram_indices.append(
                    np.frombuffer(p.get("author_char_trigram_indices"),
                                  dtype=np.int32))

                cat_tokens.append(
                    np.frombuffer(p.get("cat_tokens"), dtype=np.int32))
                cip = int(
                    np.frombuffer(p.get("cat_in_product"), dtype=np.int32))
                cat_in_product.append(cip)
                cat_unigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_unigram_indices"),
                                      dtype=np.int32),
                        (cip, meta_inst.maximums_cat[0])))
                cat_bigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_bigram_indices"),
                                      dtype=np.int32),
                        (cip, meta_inst.maximums_cat[1])))
                cat_char_trigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_char_trigram_indices"),
                                      dtype=np.int32),
                        (cip, meta_inst.maximums_cat[2])))

                attr_tokens.append(
                    np.frombuffer(p.get("attr_tokens"), dtype=np.int32))
                aip = int(
                    np.frombuffer(p.get("attr_in_product"), dtype=np.int32))
                attr_in_product.append(aip)
                attr_unigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_unigram_indices"),
                                      dtype=np.int32),
                        (aip, meta_inst.maximums_attr[0])))
                attr_bigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_bigram_indices"),
                                      dtype=np.int32),
                        (aip, meta_inst.maximums_attr[1])))
                attr_char_trigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_char_trigram_indices"),
                                      dtype=np.int32),
                        (aip, meta_inst.maximums_attr[2])))

                features.append(
                    np.frombuffer(p.get("features"), dtype=np.float32))

            product_unigram_indices = np.stack(product_unigram_indices)
            product_bigram_indices = np.stack(product_bigram_indices)
            product_char_trigram_indices = np.stack(
                product_char_trigram_indices)

            brand_unigram_indices = np.stack(brand_unigram_indices)
            brand_bigram_indices = np.stack(brand_bigram_indices)
            brand_char_trigram_indices = np.stack(brand_char_trigram_indices)

            author_unigram_indices = np.stack(author_unigram_indices)
            author_bigram_indices = np.stack(author_bigram_indices)
            author_char_trigram_indices = np.stack(author_char_trigram_indices)

            cat_tokens = np.concatenate(cat_tokens)
            cat_in_product = np.array(cat_in_product, dtype=np.int32)
            cat_unigram_indices = np.concatenate(cat_unigram_indices, axis=0)
            cat_bigram_indices = np.concatenate(cat_bigram_indices, axis=0)
            cat_char_trigram_indices = np.concatenate(cat_char_trigram_indices,
                                                      axis=0)

            attr_tokens = np.concatenate(attr_tokens)
            attr_in_product = np.array(attr_in_product, dtype=np.int32)
            attr_unigram_indices = np.concatenate(attr_unigram_indices, axis=0)
            attr_bigram_indices = np.concatenate(attr_bigram_indices, axis=0)
            attr_char_trigram_indices = np.concatenate(
                attr_char_trigram_indices, axis=0)

            features = np.stack(features)

            labels = np.asarray(labels, dtype=np.int32)
            qids = np.asarray(qids, dtype=np.int32)

            queue.put([
                query_unigram_indices, query_bigram_indices,
                query_char_trigram_indices, product_unigram_indices,
                product_bigram_indices, product_char_trigram_indices,
                brand_unigram_indices, brand_bigram_indices,
                brand_char_trigram_indices, author_unigram_indices,
                author_bigram_indices, author_char_trigram_indices, cat_tokens,
                cat_in_product, cat_unigram_indices, cat_bigram_indices,
                cat_char_trigram_indices, attr_tokens, attr_in_product,
                attr_unigram_indices, attr_bigram_indices,
                attr_char_trigram_indices, features, count_keyword, qids,
                labels
            ])

            total_sample += 1
            if total_sample > limit_sample:
                queue.put(None)
                break

        if total_sample > limit_sample:
            # queue.put(None)
            break
    meta_inst.conn.close()
    print("Worker-%d Exiting" % wid)