def ft_process(text):
    # split the input into lines
    lines = text.split('\n')
    # max_0 holds the maximum number of trailing zeros seen for each hash function
    global max_0
    for line in lines:
        # skip blank entries
        if len(line) == 0:
            continue
        # process only lines that begin with 'Q' (quotes)
        if line[0] == "Q":
            quote = line[2:]
            for i in range(10):
                # count trailing zeros for super_fast_hash
                hash_value = pyhash.super_fast_hash(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # treat the count as 0 if no 1's are found in the binary string
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0
            for i in range(10, 20):
                # count trailing zeros for murmur3_32
                hash_value = pyhash.murmur3_32(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # treat the count as 0 if no 1's are found in the binary string
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0
            for i in range(20, 30):
                # count trailing zeros for xx_32
                hash_value = pyhash.xx_32(seed=i)(quote)
                tail_0 = len(bin(hash_value)[2:]) - (bin(hash_value)[2:].rfind('1') + 1)
                # treat the count as 0 if no 1's are found in the binary string
                if tail_0 == len(bin(hash_value)[2:]):
                    tail_0 = 0
                if tail_0 > max_0[i]:
                    max_0[i] = tail_0
    return
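# A possible follow-up, sketched here for illustration: the trailing-zero maxima
# collected in max_0 are the raw material of a Flajolet-Martin cardinality estimate.
# The helper name ft_estimate, the grouping into runs of ten seeds, and the standard
# 0.77351 correction constant are assumptions, not part of the original code.
def ft_estimate(max_0, group_size=10):
    # each hash function i suggests roughly 2**max_0[i] distinct quotes;
    # average the exponents within each group of seeds, then take the median
    # across groups to damp outliers.
    groups = [max_0[k:k + group_size] for k in range(0, len(max_0), group_size)]
    group_means = [sum(g) / len(g) for g in groups]
    group_means.sort()
    median_exponent = group_means[len(group_means) // 2]
    return (2 ** median_exponent) / 0.77351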
def __init__(self, conf):
    self.hashes = conf["hashes"]
    self.b = conf["b"]
    self.r = conf["r"]
    self.band_seed = conf["band_seed"]
    self.seeds = conf["seeds"]
    self.hasher = pyhash.murmur3_32()
def __init__(self, conf):
    self.hashes = conf['hashes']
    self.b = conf['b']
    self.r = conf['r']
    self.band_seed = conf['band_seed']
    self.seeds = conf['seeds']
    self.hasher = pyhash.murmur3_32()
def _run_minhash(A, B, data, seed, num_perm, b):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm, hashobj=Hash)
    m2 = MinHash(num_perm=num_perm, hashobj=Hash)
    for i in xrange(a_start, a_end):
        m1.update(hasher(data[i], seed=seed))
    for i in xrange(b_start, b_end):
        m2.update(hasher(data[i], seed=seed))
    return [m1.jaccard(m2), _b_bit_minhash_jaccard(m1, m2, b)]
def _run_minhash(A, B, data, seed, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return jaccard([m1, m2])
def _run_hyperloglog(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p, hashobj=Hash)
    h2 = HyperLogLog(p=p, hashobj=Hash)
    for i in xrange(a_start, a_end):
        h1.update(hasher(data[i], seed=seed))
    for i in xrange(b_start, b_end):
        h2.update(hasher(data[i], seed=seed))
    return _hyperloglog_jaccard(h1, h2)
def _run_minhash(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=2**p)
    m2 = MinHash(num_perm=2**p)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return _minhash_inclusion(m1, m2)
def _run_hyperloglog(A, B, data, seed, p):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    for i in xrange(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return _hyperloglog_inclusion(h1, h2)
def _run_minhash(A, B, data, seed, bs, num_perm):
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    m1 = MinHash(num_perm=num_perm)
    m2 = MinHash(num_perm=num_perm)
    for i in xrange(a_start, a_end):
        m1.digest(Hash(hasher(data[i], seed=seed)))
    for i in xrange(b_start, b_end):
        m2.digest(Hash(hasher(data[i], seed=seed)))
    return [m1.jaccard(m2)] + \
        [_b_bit_minhash_jaccard(m1, m2, b) for b in bs]
def __init__(self, filter_size):
    '''
    First, we initialize the Bloom filter and create a bit array of filter_size
    entered by the user. In addition, we initialize each of the hash functions
    (Murmur3 and FNV-1) of our filter.

    filter_size: size of the vector
    '''
    self.filter = filter_size
    self.bit_array = bitarray(filter_size)
    self.bit_array.setall(0)  # bitarray doesn't guarantee that all bits start at 0
    self.hasher1 = pyhash.fnv1_32()
    self.hasher2 = pyhash.murmur3_32()
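# The excerpt above only shows the constructor. A minimal sketch of the add/lookup
# methods such a two-hash filter would typically pair with it; the method names
# add and might_contain are illustrative assumptions, not taken from the original.
def add(self, item):
    # set the bits selected by both hash functions, folded into the filter size
    self.bit_array[self.hasher1(item) % self.filter] = 1
    self.bit_array[self.hasher2(item) % self.filter] = 1

def might_contain(self, item):
    # an item may be present only if every selected bit is set;
    # a single 0 bit proves the item was never added
    return (self.bit_array[self.hasher1(item) % self.filter] == 1 and
            self.bit_array[self.hasher2(item) % self.filter] == 1)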
def __init__(self, set_data_path: str, hash_mod: int = 10000, number_of_hashes: int = 20):
    self.set_data_path = set_data_path
    self.hasher = pyhash.murmur3_32()
    self.bucket_numbers = {}
    if os.path.exists(SETS_PATH % self.set_data_path):
        with open(SETS_PATH % self.set_data_path, "rb") as sets_file:
            self.hashing_sets, self.bucket_numbers = pickle.load(sets_file)
    else:
        self.hashing_sets = [set() for _ in range(number_of_hashes)]
    self.hash_mod = hash_mod
    self.contains_lru = deque(maxlen=CONTAINS_CACHE_SIZE)
    self.add_lru = deque(maxlen=MAX_UNCOMMITED)
    self.writeahead_log = self._writeahead_init()
    self.log_uncommited = []
def __init__(self, set_data_path: str, hash_mod: int = 10000, number_of_hashes: int = 20,
             commit_number: Optional[int] = None, recover_state_on_init: bool = False):
    super().__init__()
    self.set_data_path = set_data_path
    self.hash_mod = hash_mod
    self.number_of_hashes = number_of_hashes
    self.contains_lru = deque(maxlen=CONTAINS_CACHE_SIZE)
    self.add_lru = deque(maxlen=ADD_LRU)
    self.writeahead_log = None
    self.hasher = pyhash.murmur3_32()
    self.prepare_buffer = []
    self.hashing_sets = []
    self.bucket_numbers = {}
    # This allows an invalid state in the object, sorry fontela :(
    if recover_state_on_init:
        self.recover_state(commit_number)
def __init__(self):
    self.DIVISOR = 20
    self.big_vector = [0] * self.DIVISOR
    self.fnv = fnv1_32()
    self.murmur = murmur3_32()
def __init__(self, dim, threshold):
    self.dim = dim
    self.threshold = threshold
    self.bandwidth = self.get_bandwidth(dim, threshold)
    self.hasher = pyhash.murmur3_32()
def __init__(self, dim, seeds=None):
    self.dim = dim
    self.seeds = self._set_seeds(seeds)
    self.hasher = pyhash.murmur3_32()
    self.hashes = self._hash_functions()
def __init__(self, path):
    self.path = path
    self.num_scanned = 0
    Thread.__init__(self)
    self.hasher = pyhash.murmur3_32()
import pyhash

bit_vector = [0] * 20
fnv = pyhash.fnv1_32()
murmur = pyhash.murmur3_32()
import string
from itertools import product

NONE, PYHASH, PURE_PYTHON_HASH, CACHED = 0, 1, 2, 3
hash_mode = PURE_PYTHON_HASH

if hash_mode == PYHASH:
    import pyhash
    hasher = pyhash.murmur3_32()

    def hash_function(seq):
        return hasher(seq) % 1000000

elif hash_mode == PURE_PYTHON_HASH:
    def hash_integer(elem):
        # RanHash
        v = elem * 3935559000370003845 + 2691343689449507681
        v = v ^ (v >> 21)
        v = v ^ (v << 37)
        v = v ^ (v >> 4)
        v = v * 4768777513237032717
        v = v ^ (v << 20)
        v = v ^ (v >> 41)
        v = v ^ (v << 5)
        return v

    def hash_function(seq):
        return hash_integer(hash(seq)) % 1000000

hash_dict = None

def precompute_hashes(m):
    global hash_dict, hash_mode
    if hash_mode == NONE:
        return
# Section 2 - Bloom Implementation
!pip install bitarray
from bitarray import bitarray
!pip install pyhash
import pyhash
from pyhash import murmur3_32  # importing non-cryptographic hash functions
from pyhash import fnv1_32

fnv_hasher = pyhash.fnv1_32()
murmur_hasher = pyhash.murmur3_32()

class Bloom():
    """Bloom Filter"""

    def __init__(self, filter_size):
        '''
        First, we initialize the Bloom filter and create a bit array of filter_size
        entered by the user. In addition, we initialize each of the hash functions
        (Murmur3 and FNV-1) of our filter.

        filter_size: size of the vector
        '''
        self.filter = filter_size
        self.bit_array = bitarray(filter_size)
        self.bit_array.setall(0)  # bitarray doesn't guarantee that all bits start at 0
def setUp(self):
    self.unigrams = set()
    self.bigrams = set()
    self.char_trigrams = set()
    self.cat_tokens = set()
    self.attr_tokens = set()
    self.hasher = pyhash.murmur3_32()
    self.unknown_bin = 16
    self.feature_precomputed = {
        "reviews": [0.0, 3437.0],
        "rating": [0.0, 100.0],
        "sales_monthly": [0.0, 14345.0],
        "sales_yearly": [0.0, 136592.0],
        "support_p2h_delivery": [0.0, 1.0]
    }
    self.header_fields = [
        "reviews", "rating", "sales_monthly", "sales_yearly",
        "support_p2h_delivery"
    ]
    p1 = {
        'product_name': 'Ổ Cứng SSD Kingston HyperX FURY 120GB - SATA III - Hàng Chính Hãng',
        'brand': 'Kingston',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 100,
        'rating': 80,
        'sales_monthly': 20,
        'sales_yearly': 100,
        'support_p2h_delivery': 1
    }
    p2 = {
        'product_name': 'Ổ Cứng SSD Sata III 2.5 Inch Samsung 850 EVO 120GB - Hàng Chính Hãng',
        'brand': 'Samsung',
        'author': '',
        'attributes': '1165#filter_ssd_storage#120 GB - 128 GB|1166#filter_ssd_product_size#2.5 inch',
        'categories': '1846#2#Laptop - Máy Vi Tính - Linh kiện|8060#3#Thiết bị lưu trữ',
        'reviews': 71,
        'rating': 95,
        'sales_monthly': 10,
        'sales_yearly': 50,
        'support_p2h_delivery': 1
    }
    p3 = {
        'product_name': 'Mai Em Vào Lớp 1 - Vở Tập Tô Chữ (Dành Cho Trẻ 5 - 6 Tuổi) - Tập 1',
        'brand': '',
        'author': ['Lê Hồng Đăng - Lê Thị Ngọc Ánh'],
        'attributes': '',
        'categories': '8322#2#Nhà Sách Tiki|316#3#Sách tiếng Việt |393#4#Sách thiếu nhi |853#5#Kiến thức - Bách khoa',
        'reviews': 0,
        'rating': 0,
        'sales_monthly': 3,
        'sales_yearly': 10,
        'support_p2h_delivery': 1
    }
    self.products = [p1, p2, p3]
    self.queries = ['ổ cứng', 'samsung 850', 'vở tập tô']
    self.target = [2, 1, 0]  # positive, impressed, negative
    self.weights = [1., 1., 1.]
    for p in self.products:
        self.add_to_vocab(query_preprocessing(p['product_name']))
        for z in p['attributes'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.attr_tokens.add(t)
        for z in p['categories'].split('|'):
            t = "#".join(z.split("#")[:2])
            self.cat_tokens.add(t)
    self.vocab = self.unigrams.copy()
    self.vocab = self.vocab.union(self.bigrams, self.char_trigrams)
    self.vocab = list(self.vocab)
    self.zero_idx = len(self.vocab) + self.unknown_bin
    self.token_2_idx = {}
    for i, t in enumerate(self.vocab):
        self.token_2_idx[t] = i
    self.cat_token_2_idx = {}
    for i, t in enumerate(self.cat_tokens):
        self.cat_token_2_idx[t] = i
    self.cat_zero_idx = len(self.cat_tokens)
    self.attr_token_2_idx = {}
    for i, t in enumerate(self.attr_tokens):
        self.attr_token_2_idx[t] = i
    self.attr_zero_idx = len(self.attr_tokens)
    self.embed_size = 80
    self.attr_cat_embed_size = 10
    self.vocab_size = len(self.token_2_idx)
    self.max_query_length = 25
    self.max_product_length = 50
    self.max_brand_length = 25
    self.max_author_length = 25
    self.max_attr_length = 10
    self.max_cat_length = 10
    self.filter_sizes = [2, 3, 4, 5]
    self.num_filters = 5
def _run_hyperloglog(data, seed, p):
    hasher = pyhash.murmur3_32()
    h = HyperLogLog(p=p)
    for d in data:
        h.digest(Hash(hasher(d, seed=seed)))
    return h.count()
def __init__(self,
             pair_paths,
             precomputed_path,
             product_db,
             vocab_path,
             cat_tokens_path,
             attr_tokens_path,
             maximums_query=[25, 25, 125],  # for unigram, bigram, character trigrams
             maximums_product_name=[50, 50, 250],  # for unigram, bigram, character trigrams
             maximums_brand=[10, 10, 50],
             maximums_author=[10, 10, 50],
             maximums_cat=[10, 10, 20],  # for unigram, bigram, character trigrams
             maximums_attr=[10, 10, 20],  # for unigram, bigram, character trigrams
             unknown_bin=8012):
    self.vocab = []
    with open(vocab_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.vocab.append(l.strip())
    self.cat_tokens = []
    with open(cat_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.cat_tokens.append(l.strip())
    self.attr_tokens = []
    with open(attr_tokens_path, 'r') as fobj:
        for l in fobj:
            if len(l.strip()):
                self.attr_tokens.append(l.strip())
    with open(precomputed_path, 'r') as fobj:
        self.precomputed = json.load(fobj)
    self.vocab_size = len(self.vocab)
    self.cat_tokens_size = len(self.cat_tokens)
    self.attr_tokens_size = len(self.attr_tokens)
    self.unknown_bin = unknown_bin
    self.maximums_query = maximums_query
    self.maximums_product_name = maximums_product_name
    self.maximums_brand = maximums_brand
    self.maximums_author = maximums_author
    self.maximums_cat = maximums_cat
    self.maximums_attr = maximums_attr
    self.token_2_idx = {}
    self.cat_token_2_idx = {}
    self.attr_token_2_idx = {}
    self.zero_idx = len(self.vocab) + self.unknown_bin
    for i, w in enumerate(self.vocab):
        self.token_2_idx[w] = i
    self.cat_zero_idx = len(self.cat_tokens)
    for i, w in enumerate(self.cat_tokens):
        self.cat_token_2_idx[w] = i
    self.attr_zero_idx = len(self.attr_tokens)
    for i, w in enumerate(self.attr_tokens):
        self.attr_token_2_idx[w] = i
    self.hasher = pyhash.murmur3_32()

    # initialize sampling pools
    self.pair_paths = pair_paths
    self.precomputed_path = precomputed_path
    # self.conn = create_connection(product_db)
    # self.headers = get_fields(self.conn)
    if product_db:
        self.product_dict = {}
        with open(product_db, "r") as fobj:
            csv_reader = csv.DictReader(fobj)
            for i, r in enumerate(csv_reader):
                r = dict(r)
                r["name"] = query_preprocessing(r.get("name"))
                r["brand"] = query_preprocessing(r.get("brand"))
                r["author"] = query_preprocessing(r.get("author"))
                self.product_dict[r.get("product_id")] = r
                if i % 100000 == 0:
                    print("Loaded %d products" % i)
        self.product_ids = list(self.product_dict.keys())
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p)
    for d in data:
        m.digest(Hash(hasher(d, seed=seed)))
    return m.count()
# Copyright (C) 2019 Intel Corporation
#
# SPDX-License-Identifier: MIT

import os.path as osp

from pyhash import murmur3_32

from datumaro.cli.util import make_file_name

hasher = murmur3_32()

def get_color_from_index(index):
    def get_bit(number, index):
        return (number >> index) & 1

    color = [0, 0, 0]
    for j in range(7, -1, -1):
        for c in range(3):
            color[c] |= get_bit(index, c) << j
        index >>= 3
    return tuple(color)

DEFAULT_COLORMAP_CAPACITY = 2000
DEFAULT_COLORMAP_PATH = osp.join(osp.dirname(__file__), 'predefined_colors.txt')

def parse_default_colors(file_path=None):
    if file_path is None:
        file_path = DEFAULT_COLORMAP_PATH

    colors = {}
def __init__(self, by_key: str, shards: int):
    self.by_key = by_key
    self.shards = shards
    self.hasher = pyhash.murmur3_32()
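# The constructor above only stores the sharding configuration. A plausible routing
# method is sketched below; the name shard_for and the str() conversion of the key
# value are assumptions for illustration only.
def shard_for(self, record: dict) -> int:
    # hash the value stored under the configured key and fold it into the shard range
    return self.hasher(str(record[self.by_key])) % self.shards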
import pyhash as ph

# Non-cryptographic hash functions (Murmur and FNV)
fnv = ph.fnv1_32()
murmur = ph.murmur3_32()

# Calculate the output of the FNV and Murmur hash functions for Pikachu and Charmander
bit_vector = [0] * 20

fnv_pika = fnv("Pikachu") % 20
murmur_pika = murmur("Pikachu") % 20
fnv_char = fnv("Charmander") % 20
murmur_char = murmur("Charmander") % 20

print("fnv_pika\t", fnv_pika)
print("fnv_char\t", fnv_char)
print("murmur_pika\t", murmur_pika)
print("murmur_char\t", murmur_char)

bit_vector[fnv_char] = 1
bit_vector[fnv_pika] = 1
bit_vector[murmur_char] = 1
bit_vector[murmur_pika] = 1

print(bit_vector)
# If either of them (or both) is 0, the item is not in the Bloom filter
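# The closing comment states the lookup rule; a short sketch of that check for an
# item that was never added ("Bulbasaur" is just an illustrative input):
fnv_bulba = fnv("Bulbasaur") % 20
murmur_bulba = murmur("Bulbasaur") % 20
if bit_vector[fnv_bulba] == 1 and bit_vector[murmur_bulba] == 1:
    print("Bulbasaur may be in the filter (possibly a false positive)")
else:
    print("Bulbasaur is definitely not in the filter")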
    sys.stdout.write("{0}: {1}\n".format(prog, msg))

def echo(msg):
    sys.stdout.write(msg + "\n")

def int_as_bytearray(i):
    # Doing this avoids the leading 0x and trailing L (on longs) that
    # hex() adds.
    as_hex = "%x" % i
    # The ``if... else...`` bit is to ensure that we have an even
    # number of digits.
    return bytearray.fromhex(as_hex if not len(as_hex) % 2 else "0" + as_hex)

hasher = murmur3_32()

def get_hash(src):
    return base64.urlsafe_b64encode(int_as_bytearray(hasher(src))) \
        .rstrip("=")

def get_src_path(working_dir):
    return os.path.join(working_dir, "src")

class Command(object):
    def __init__(self, args):
        """
        A command on the command line. This backup software takes a
        command as its first argument.
        """
        self.args = args
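# get_hash turns the 32-bit murmur3 value into a short, URL-safe token. A quick
# illustration under the Python 2 semantics the surrounding code assumes; the
# input string is arbitrary.
token = get_hash("src/main.c")
print(token)  # at most 6 base64 characters once the '=' padding is stripped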
def _run_hyperloglog(data, seed, p):
    hasher = pyhash.murmur3_32()
    h = HyperLogLog(p=p, hashobj=Hash)
    for d in data:
        h.update(hasher(d, seed=seed))
    return h.count()
def _run_minhash(data, seed, p):
    hasher = pyhash.murmur3_32()
    m = MinHash(num_perm=2**p, hashobj=Hash)
    for d in data:
        m.update(hasher(d, seed=seed))
    return m.count()
def bloomIPs(clientSize):
    """
    I will implement the IP blocker example above. For simplicity, let's assume
    our IP system is composed of values between 0 and 100000.
    """
    # My bit vector
    bitVector = [0] * clientSize

    """
    We use 3 hash functions: Murmur, FNV and metro hash. They are non-cryptographic,
    hence they return the same value any time we pass in the same input. We take them
    modulo our bit vector size so they fit into it, as it is our reference sheet.
    """
    fnv = pyhash.fnv1_32()
    murmur = pyhash.murmur3_32()
    metro = pyhash.metro_128()

    """
    Now let's imagine we identified a set of just 1000 hackers in our world.
    Their IPs range from 0 to 1000 as follows.
    """
    hackerSize = 1000
    hackersList = range(0, hackerSize)

    """
    To keep our random clients happy, we build a repo of all the known hackers.
    Our customers are very stubborn but they love being safe. It's a very dangerous
    world out there. We mark the hackers.
    """
    for hacker in hackersList:
        # Hash them with our 3 functions
        bitVector[fnv(str(hacker)) % clientSize] = 1
        bitVector[murmur(str(hacker)) % clientSize] = 1
        bitVector[metro(str(hacker)) % clientSize] = 1

    """
    Now our true clients make requests; say we have 100000 of them. We look each one
    up in our filter and decide whether it is a hacker. An approved request is marked
    as perfect. Let's count how many true clients are wrongly flagged as hackers
    (false positives) and how many are marked perfect.
    """
    perfect = 0
    falsePositive = 0
    for cust in range(0, 100000):
        trueClient = random.randrange(10000, 100000)
        check1 = bitVector[fnv(str(trueClient)) % clientSize]
        check2 = bitVector[murmur(str(trueClient)) % clientSize]
        check3 = bitVector[metro(str(trueClient)) % clientSize]
        # print("{}-{}-{}".format(check1, check2, check3))

        """
        If all three checks flag the client as a hacker, we record a false positive.
        Otherwise we count the client as perfect.
        """
        checkFinale = (check1 == check2 == check3 == 1)
        if checkFinale is True:
            falsePositive += 1
        else:
            perfect += 1

    doc = """
    Running our check, we will throw warnings at some true clients, thinking they
    are hackers when they are in fact not. Run it again to see how many true clients
    connect. At least we know they are safe. But as seen, there is a possibility that
    clients whose IPs are not even in the same range as the hackers get flagged as
    hackers."""
    return {
        "hackerSize": hackerSize,
        "falsePositive": falsePositive,
        "clientSize": clientSize,
        "doc": doc,
        "perfect": perfect
    }
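# A quick illustration of calling the simulation; the 100000-slot filter size is
# just an example value.
result = bloomIPs(100000)
print("false positives:", result["falsePositive"])
print("marked perfect:", result["perfect"])
print(result["doc"])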
# https://code.google.com/p/pyfasthash/

h_fnv1_32 = pyhash.fnv1_32()
def fnv1_32(req):
    return h_fnv1_32(str(req))

h_lookup3 = pyhash.lookup3_big()
def lookup3(req):
    return h_lookup3(str(req))

h_super_fast_hash = pyhash.super_fast_hash()
def super_fast_hash(req):
    return h_super_fast_hash(str(req))

h_murmur2_x64_64a = pyhash.murmur2_x64_64a()
def murmur2_x64_64a(req):
    return h_murmur2_x64_64a(str(req))

h_murmur3_32 = pyhash.murmur3_32()
def murmur3_32(req):
    return h_murmur3_32(str(req))

h_fnv1a_64 = pyhash.fnv1a_64()
def fnv1a_64(req):
    return h_fnv1a_64(str(req))
is not.

EG: I run a website and want to keep track of IP addresses that are blocked.
I don't care if a blocked IP is occasionally able to access my website, but I do
care if someone not on the blocked list is unable to access the site.

bit_vector is a list of bits
"""
import pyhash

bit_vector = [0] * 20

# Non-cryptographic hash functions (Murmur and FNV)
fnv = pyhash.fnv1_32()
murmur = pyhash.murmur3_32()

# Calculate the output of the FNV and Murmur hash functions for Pikachu and Charmander
fnv_pika = fnv("Pikachu") % 20
fnv_char = fnv("Charmander") % 20
murmur_pika = murmur("Pikachu") % 20
murmur_char = murmur("Charmander") % 20

bit_vector[fnv_pika] = 1
bit_vector[murmur_pika] = 1
bit_vector[fnv_char] = 1
bit_vector[murmur_char] = 1

# print(fnv_pika)
def test_default_string_type():
    hasher = pyhash.murmur3_32()

    assert hasher('foo') == hasher(u'foo')
    assert hasher('foo') != hasher(b'foo')
def worker(wid, queue, csv_queue, limit_sample, batch_size, precomputed_path,
           product_db, vocab_path, cat_tokens_path, attr_tokens_path,
           maximums_query, maximums_product_name, maximums_brand,
           maximums_author, maximums_cat, maximums_attr, unknown_bin):
    hasher = pyhash.murmur3_32()

    def unknown_to_idx():
        def _inside(unknown):
            return hasher(unknown) % unknown_bin
        return _inside

    meta_inst = MetaData(precomputed_path, product_db, vocab_path,
                         cat_tokens_path, attr_tokens_path, maximums_query,
                         maximums_product_name, maximums_brand,
                         maximums_author, maximums_cat, maximums_attr,
                         unknown_bin)
    product_ids = meta_inst.product_ids
    print("Data worker %d started" % wid)
    total_sample = 0
    while True:
        if queue.qsize() > 1000:
            time.sleep(0.1)
            continue
        for _ in range(64):
            queries = []
            labels = []
            products = []
            qids = []
            count_keyword = 0
            unique_queries = []
            count_qs = []
            count_t = 0
            for k in range(batch_size):
                r = csv_queue.get()
                keyword = r[0]
                r1 = r[1]
                if len(keyword) == 0:
                    continue
                count_keyword += 1
                pk = r1.split("|")
                pnk = [z.split("#") for z in pk]
                pos = []
                zero = []
                neg = []
                for p in pnk:
                    if p[1] == '2':
                        pos.append(p[0])
                    elif p[1] == '1':
                        zero.append(p[0])
                    else:
                        neg.append(p[0])
                n = min(1, len(pos))
                if n > 6:
                    n = 4
                pos = random.sample(pos, n)
                if n == 0:
                    n = len(zero)
                    if n > 6:
                        n = 4
                    zero = random.sample(zero, n)
                    if n:
                        neg = random.sample(product_ids, n * 7) + random.sample(
                            neg, min(len(neg), 8))
                        pass
                    else:
                        continue
                else:
                    zero = random.sample(zero, min(len(zero), n * 6))
                    neg = random.sample(product_ids, n * 7) + random.sample(
                        neg, min(len(neg), 8))
                count_q = 0
                for samples, l in zip([pos, zero, neg], [2, 1, 0]):
                    for s in samples:
                        product = meta_inst.get_product(s)
                        if product:
                            count_q += 1
                            queries.append(keyword)
                            qids.append(count_keyword)
                            products.append(product)
                            labels.append(l)
                if count_q:
                    unique_queries.append(keyword)
                    count_qs.append(count_q)
            query_unigram_indices = []
            query_bigram_indices = []
            query_char_trigram_indices = []
            for q, r in zip(unique_queries, count_qs):
                u, b, t = \
                    convert_strings(
                        [q], meta_inst.token_2_idx, meta_inst.zero_idx,
                        meta_inst.maximums_query[0], meta_inst.maximums_query[1],
                        meta_inst.maximums_query[2], unknown_to_idx())
                query_unigram_indices.append(np.tile(u, (r, 1)))
                query_bigram_indices.append(np.tile(b, (r, 1)))
                query_char_trigram_indices.append(np.tile(t, (r, 1)))
            query_unigram_indices = np.concatenate(query_unigram_indices, axis=0)
            query_bigram_indices = np.concatenate(query_bigram_indices, axis=0)
            query_char_trigram_indices = np.concatenate(
                query_char_trigram_indices, axis=0)
            product_unigram_indices = []
            product_bigram_indices = []
            product_char_trigram_indices = []
            brand_unigram_indices = []
            brand_bigram_indices = []
            brand_char_trigram_indices = []
            author_unigram_indices = []
            author_bigram_indices = []
            author_char_trigram_indices = []
            cat_tokens = []
            cat_in_product = []
            cat_unigram_indices = []
            cat_bigram_indices = []
            cat_char_trigram_indices = []
            attr_tokens = []
            attr_in_product = []
            attr_unigram_indices = []
            attr_bigram_indices = []
            attr_char_trigram_indices = []
            features = []
            for p in products:
                product_unigram_indices.append(
                    np.frombuffer(p.get("product_unigram_indices"), dtype=np.int32))
                product_bigram_indices.append(
                    np.frombuffer(p.get("product_bigram_indices"), dtype=np.int32))
                product_char_trigram_indices.append(
                    np.frombuffer(p.get("product_char_trigram_indices"), dtype=np.int32))
                brand_unigram_indices.append(
                    np.frombuffer(p.get("brand_unigram_indices"), dtype=np.int32))
                brand_bigram_indices.append(
                    np.frombuffer(p.get("brand_bigram_indices"), dtype=np.int32))
                brand_char_trigram_indices.append(
                    np.frombuffer(p.get("brand_char_trigram_indices"), dtype=np.int32))
                author_unigram_indices.append(
                    np.frombuffer(p.get("author_unigram_indices"), dtype=np.int32))
                author_bigram_indices.append(
                    np.frombuffer(p.get("author_bigram_indices"), dtype=np.int32))
                author_char_trigram_indices.append(
                    np.frombuffer(p.get("author_char_trigram_indices"), dtype=np.int32))
                cat_tokens.append(
                    np.frombuffer(p.get("cat_tokens"), dtype=np.int32))
                cip = int(np.frombuffer(p.get("cat_in_product"), dtype=np.int32))
                cat_in_product.append(cip)
                cat_unigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_unigram_indices"), dtype=np.int32),
                        (cip, meta_inst.maximums_cat[0])))
                cat_bigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_bigram_indices"), dtype=np.int32),
                        (cip, meta_inst.maximums_cat[1])))
                cat_char_trigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("cat_char_trigram_indices"), dtype=np.int32),
                        (cip, meta_inst.maximums_cat[2])))
                attr_tokens.append(
                    np.frombuffer(p.get("attr_tokens"), dtype=np.int32))
                aip = int(np.frombuffer(p.get("attr_in_product"), dtype=np.int32))
                attr_in_product.append(aip)
                attr_unigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_unigram_indices"), dtype=np.int32),
                        (aip, meta_inst.maximums_attr[0])))
                attr_bigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_bigram_indices"), dtype=np.int32),
                        (aip, meta_inst.maximums_attr[1])))
                attr_char_trigram_indices.append(
                    np.reshape(
                        np.frombuffer(p.get("attr_char_trigram_indices"), dtype=np.int32),
                        (aip, meta_inst.maximums_attr[2])))
                features.append(
                    np.frombuffer(p.get("features"), dtype=np.float32))
            product_unigram_indices = np.stack(product_unigram_indices)
            product_bigram_indices = np.stack(product_bigram_indices)
            product_char_trigram_indices = np.stack(product_char_trigram_indices)
            brand_unigram_indices = np.stack(brand_unigram_indices)
            brand_bigram_indices = np.stack(brand_bigram_indices)
            brand_char_trigram_indices = np.stack(brand_char_trigram_indices)
            author_unigram_indices = np.stack(author_unigram_indices)
            author_bigram_indices = np.stack(author_bigram_indices)
            author_char_trigram_indices = np.stack(author_char_trigram_indices)
            cat_tokens = np.concatenate(cat_tokens)
            cat_in_product = np.array(cat_in_product, dtype=np.int32)
            cat_unigram_indices = np.concatenate(cat_unigram_indices, axis=0)
            cat_bigram_indices = np.concatenate(cat_bigram_indices, axis=0)
            cat_char_trigram_indices = np.concatenate(cat_char_trigram_indices, axis=0)
            attr_tokens = np.concatenate(attr_tokens)
            attr_in_product = np.array(attr_in_product, dtype=np.int32)
            attr_unigram_indices = np.concatenate(attr_unigram_indices, axis=0)
            attr_bigram_indices = np.concatenate(attr_bigram_indices, axis=0)
            attr_char_trigram_indices = np.concatenate(attr_char_trigram_indices, axis=0)
            features = np.stack(features)
            labels = np.asarray(labels, dtype=np.int32)
            qids = np.asarray(qids, dtype=np.int32)
            queue.put([
                query_unigram_indices, query_bigram_indices,
                query_char_trigram_indices, product_unigram_indices,
                product_bigram_indices, product_char_trigram_indices,
                brand_unigram_indices, brand_bigram_indices,
                brand_char_trigram_indices, author_unigram_indices,
                author_bigram_indices, author_char_trigram_indices,
                cat_tokens, cat_in_product, cat_unigram_indices,
                cat_bigram_indices, cat_char_trigram_indices,
                attr_tokens, attr_in_product, attr_unigram_indices,
                attr_bigram_indices, attr_char_trigram_indices,
                features, count_keyword, qids, labels
            ])
            total_sample += 1
            if total_sample > limit_sample:
                queue.put(None)
                break
        if total_sample > limit_sample:
            # queue.put(None)
            break
    meta_inst.conn.close()
    print("Worker-%d Exiting" % wid)