def decode_lz4_old_kafka(buff):
    """Decode buff for 0.8/0.9 brokers

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L258
    """
    assert xxhash is not None
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(buff[4], int):
        flg = buff[4]
    else:
        flg = ord(buff[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(buff[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_buff = b''.join([
        buff[0:header_size-1],
        hc,
        buff[header_size:]
    ])
    return decode_lz4(munged_buff)

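The digest()[-2:-1] slice above deserves a note. A minimal standalone sketch (not part of the snippet, descriptor bytes chosen arbitrarily): because python-xxhash's digest() is big-endian, that slice is the (XXH32(...) >> 8) & 0xFF byte the LZ4 frame format defines as its header checksum.

import xxhash

descriptor = b'\x64\x70'  # arbitrary example FLG + BD bytes
hc_slice = xxhash.xxh32(descriptor).digest()[-2:-1]
hc_shift = bytes([(xxhash.xxh32(descriptor).intdigest() >> 8) & 0xFF])
assert hc_slice == hc_shift
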
def list_n_neighbours(g_seq, l_seq, blocks=1000, leaf_size=200, nn=10):
    g_aln = [x for x in g_seq.values()]
    l_aln = [x for x in l_seq.values()]
    genome_size = len(g_aln[0].seq)  ## only works for aligned sequences
    block_size = int(genome_size / blocks)
    if (block_size < 1):
        block_size = 1
    logger.info("Creating a hashed genome with blocks of %s bases", str(block_size))
    g_hash = [[
        xxhash.xxh32(str(g_aln[j].seq[i:i + block_size])).intdigest()
        for i in range(0, genome_size, block_size)
    ] for j in range(len(g_aln))]
    btre = BallTree(
        np.array(g_hash), leaf_size=leaf_size,
        metric='hamming')  # create a neighbours tree of global sequences
    logger.info("And finding %s closest neighbours", str(nn))
    if (nn < 2):
        logger.warning(
            "Closest neighbour will be itself, if already on BallTree; useful only on two independent sets"
        )
    l_hash = [[
        xxhash.xxh32(str(l_aln[j].seq[i:i + block_size])).intdigest()
        for i in range(0, genome_size, block_size)
    ] for j in range(len(l_aln))]
    dist, idx = btre.query(l_hash, k=nn, return_distance=True)  # return_distance is free
    clusters = list(set([g_aln[j].id for x in idx
                         for j in x]))  # one-dimensional list of all global neighbours
    del g_aln, g_hash, l_aln, l_hash, btre, idx
    return clusters

def hashLocation(authTicket, latitude, longitude, altitude):
    baseHash = xxhash.xxh32(authTicket.SerializeToString(),
                            seed=0x1B845238).intdigest()
    locationBytes = d2h(latitude) + d2h(longitude) + d2h(altitude)
    # Using serialized Auth Ticket
    hashA = xxhash.xxh32(locationBytes, seed=baseHash).intdigest()
    # Hash of location using static seed 0x1B845238
    hashB = xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()
    return hashA, hashB

def bb_hash(self):
    node_hash = xxhash.xxh32()
    relationship_hash = xxhash.xxh32()
    for disasm_text in self.bb.disassembly_text:
        if 'sub_' not in str(disasm_text):
            node_hash.update(str(disasm_text))
    relationship_hash.update(
        str(self.parent_func_uuid) + str(self.parent_bb) + str(self.UUID))
    return node_hash.intdigest(), relationship_hash.intdigest()

def hashLocation(authTicket, latitude, longitude, altitude):
    baseHash = xxhash.xxh32(authTicket.SerializeToString(),
                            seed=0x1B845238).intdigest()
    # Format location
    locationBytes = d2h(latitude) + d2h(longitude) + d2h(altitude)
    # Using serialized Auth Ticket
    hashA = xxhash.xxh32(locationBytes, seed=baseHash).intdigest()
    # Hash of location using static seed 0x1B845238
    hashB = xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()
    return hashA, hashB

def inCache(self, key, val, var_type):
    ''' Check if variable is in cache by hashing '''
    hsh = ''
    if var_type == np.ndarray:
        arr_bytes = bytes(val.data)
        hsh = xxhash.xxh32(arr_bytes).hexdigest() + str(val.shape)
    else:
        hsh = xxhash.xxh32(JSON.dumps(val).encode('utf8')).hexdigest()
    if key in self.cache and hsh == self.cache[key]:
        return True, hsh
    return False, hsh

def test_xxh32_overflow(self):
    a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=0)
    b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**32)
    self.assertEqual(a.seed, b.seed)
    self.assertEqual(a.intdigest(), b.intdigest())
    self.assertEqual(a.hexdigest(), b.hexdigest())
    self.assertEqual(a.digest(), b.digest())

    a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=1)
    b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**32 + 1)
    self.assertEqual(a.seed, b.seed)
    self.assertEqual(a.intdigest(), b.intdigest())
    self.assertEqual(a.hexdigest(), b.hexdigest())
    self.assertEqual(a.digest(), b.digest())

    a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**33 - 1)
    b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**34 - 1)
    self.assertEqual(a.seed, b.seed)
    self.assertEqual(a.intdigest(), b.intdigest())
    self.assertEqual(a.hexdigest(), b.hexdigest())
    self.assertEqual(a.digest(), b.digest())

    a = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**65 - 1)
    b = xxhash.xxh32('I want an unsigned 32-bit seed!', seed=2**66 - 1)
    self.assertEqual(a.seed, b.seed)
    self.assertEqual(a.intdigest(), b.intdigest())
    self.assertEqual(a.hexdigest(), b.hexdigest())
    self.assertEqual(a.digest(), b.digest())

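A standalone sketch of the property this test exercises, assuming only the xxhash package: the seed argument is taken modulo 2**32, so seeds that differ by a multiple of 2**32 yield identical digests.

import xxhash

assert xxhash.xxh32(b'data', seed=1).digest() == xxhash.xxh32(b'data', seed=2**32 + 1).digest()
assert xxhash.xxh32(b'data', seed=0).intdigest() == xxhash.xxh32(b'data', seed=2**32).intdigest()
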
def perturb(self, x, p):
    if self.bern_ps[p]:
        # x_hash = (xxhash.xxh32(self.hash_cache[x], seed=p).intdigest()) % self.g
        pert_val = (xxhash.xxh32(self.hash_cache[x], seed=p).intdigest()) % self.g
    else:
        pert_val = self.uni_dist[p]
    dom_index = 0
    while dom_index < self.sz:
        if pert_val == (xxhash.xxh32(self.hash_cache[dom_index], seed=p).intdigest() % self.g):
            self.estimate[dom_index] += 1.0
        dom_index += 1

def __init__(self, epsilon, d, k, g=2, use_olh=True, index_mapper=None, hash_matrix=None):
    """
    Args:
        epsilon: float - The privacy budget
        d: integer - Size of the data domain
        k: integer - The number of hash functions to use. Larger k results in a more
            accurate oracle at the expense of computation time.
        g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to,
            2 by default (binary local hashing)
        use_olh: Optional boolean - if set to true uses Optimised Local Hashing,
            i.e. g is set to round(e^epsilon + 1)
        index_mapper: Optional function - maps data items to indexes in the range
            {0, 1, ..., d-1} where d is the size of the data domain
        hash_matrix: Optional matrix - Allows the use of a pre-computed hash matrix
            that contains hashed domain elements
    """
    self.k = k
    super().__init__(epsilon, d, g, use_olh, index_mapper=index_mapper)
    self.hash_counts = np.zeros((self.k, self.g))

    # g = lambda i, j: xxhash.xxh32(str(int(j)), seed=int(i)).intdigest() % self.g

    if hash_matrix is None:
        matrix = np.empty((self.k, self.d))
        for i in range(0, self.k):
            for j in range(0, self.d):
                matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g
        # self.hash_matrix = np.fromfunction(g, (self.k, self.d))
        self.hash_matrix = matrix
    else:
        self.hash_matrix = hash_matrix

def _compress_frame(self):
    ''' frame contains all the blocks, plus frame header and checksum '''
    self.dst_file.write(self._frame_header())

    def read_src(buf):
        return self.src_file.readinto(buf)

    self.src_buffer = bytearray(b'\0') * BLOCK_SIZE
    self.dst_buffer = bytearray(
        b'\0') * worst_case_block_length(BLOCK_SIZE)
    xxh = xxhash.xxh32(seed=0)
    nbytes = read_src(self.src_buffer)
    while nbytes != 0:
        block_len = lz4_compress_block(
            self.dst_buffer, memoryview(self.src_buffer)[0:nbytes])
        self.dst_file.write(memoryview(self.dst_buffer)[0:block_len])
        # only pinned buffer, not appropriate here
        xxh.update(bytes(self.src_buffer[0:nbytes]))
        nbytes = read_src(self.src_buffer)
    self.dst_file.write((0).to_bytes(4, 'little'))  # EndMark
    self.dst_file.write(xxh.intdigest().to_bytes(4, 'little'))  # CheckSum

def encode_lz4_old_kafka(buff):
    """Encode buff for 0.8/0.9 brokers -- requires an incorrect header checksum.

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L227
    """
    assert xxhash is not None
    data = encode_lz4(buff)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        buff = data[header_size + 8:]
    else:
        buff = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size - 1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([data[0:header_size - 1], hc, buff])

def hash_netloc(netloc):
    m = xxhash.xxh32()
    m.update(netloc.encode("utf-8"))
    nlhash = m.intdigest()
    return nlhash

def get_hash(v, m, k):
    # using v, get k hashes in range [0, m]
    val = np.sum(v)
    this_hash = (
        np.array([xxhash.xxh32(str(val + i)).intdigest() for i in range(k)]) -
        mn_hash) / (mx_hash - mn_hash)
    return this_hash * m

def __init__(self, name, passk):
    self.name = name
    self.__serc_id = passk
    self.__key = xxhash.xxh32(str(self.name), seed=self.__serc_id).hexdigest()
    self.__dict = {}

def get_modded_msyts(msg_sarc: sarc.SARC, lang: str = 'USen',
                     tmp_dir: Path = util.get_work_dir() / 'tmp_text') -> (list, dict):
    """
    Gets a list of modified game text files in a given message SARC

    :param msg_sarc: The message SARC to scan for changes.
    :type msg_sarc: class:`sarc.SARC`
    :param lang: The game language to use, defaults to USen.
    :type lang: str, optional
    :param tmp_dir: The temp directory to use, defaults to "tmp_text" in BCML's working directory.
    :type tmp_dir: class:`pathlib.Path`, optional
    :returns: Returns a tuple containing a list of modded text files and a dict of
        new text files with their contents.
    :rtype: (list of str, dict of str: bytes)
    """
    hashes = get_msbt_hashes(lang)
    modded_msyts = []
    added_msbts = {}
    write_msbts = []
    for msbt in msg_sarc.list_files():
        if any(exclusion in msbt for exclusion in EXCLUDE_TEXTS):
            continue
        m_data = msg_sarc.get_file_data(msbt)
        m_hash = xxhash.xxh32(m_data).hexdigest()
        if msbt not in hashes:
            added_msbts[msbt] = m_data
        elif m_hash != hashes[msbt]:
            write_msbts.append((tmp_dir / msbt, m_data.tobytes()))
            modded_msyts.append(msbt.replace('.msbt', '.msyt'))
    if write_msbts:
        pool = multiprocessing.Pool()
        pool.map(write_msbt, write_msbts)
        pool.close()
        pool.join()
    return modded_msyts, added_msbts

def pickValAndProp(new_pos, sudoku, r, ste_Hsh):
    try:
        val = 0
        tried = set()
        x = xxhash.xxh32()
        noOfStates = len(ste_Hsh)
        i = 0
        if (new_pos == [-1, -1]):
            return (-1, -1)
        common = np.intersect1d(
            r[0][locateSquareOfPos([new_pos[0], new_pos[1]], sudoku)],
            np.intersect1d(r[2][new_pos[0]], r[1][new_pos[1]]))
        for val in common:
            sudoku[new_pos[0]][new_pos[1]] = val
            x.update(sudoku)
            ste_Hsh.add(x.digest())
            x.reset()
            if (noOfStates < len(ste_Hsh)):
                break
        if (noOfStates == len(ste_Hsh)):
            return (-1, -1)
        # remove from square
        r[0][locateSquareOfPos(new_pos, sudoku)].remove(val)
        # remove from col
        r[1][new_pos[1]].remove(val)
        # remove from row
        r[2][new_pos[0]].remove(val)
        # print("no of states", noOfStates)
        return (sudoku, r)
    except:
        return (-1, -1)

def _load_definition(self, uri=None, text=None, allow_imports=True):
    if not uri and not text:
        raise ValueError("A schema uri or text must be defined")
    elif uri and text:
        raise ValueError(
            "You cannot specify multiple sources. Choose one: uri or text."
        )

    if uri:
        if uri.startswith("http"):
            text = requests.get(uri).content
        else:
            uri = uri[len("file://"):] if uri.startswith("file://") else uri
            with open(uri) as schema_file:
                text = schema_file.read()

    definition = yaml.safe_load(text)
    if allow_imports:
        self._add_imports(definition)

    metadata = definition.setdefault("__metadata__", {})
    if not metadata.get("schema_version", None):
        metadata["schema_version"] = xxhash.xxh32(text).hexdigest()
    return definition

def listener(sok, monitor, rate_queue, timeout_params, progress, last_packet_num):
    timeout_interval = 0.5
    while (len(monitor)):
        progress = 100 - ((len(monitor) - 1) * 100 / last_packet_num + 1) + 1
        print("progress = {:.2f}%".format(progress))
        try:
            sok.settimeout(timeout_interval)
            message, clientAddress = sok.recvfrom(2048)
            # sequence_num = int.from_bytes(message[0:4], byteorder='big')
            first_byte = message[0]
            # discard packet if message type is 0 (handshake)
            if first_byte >> 7 & 1 == 0:
                continue
            # if hash is not correct, discard ack
            if message[4:8] != xxhash.xxh32(message[0:4]).digest():
                continue
            else:
                sequence_num = (first_byte & int('7f', 16)).to_bytes(
                    1, byteorder='big') + message[1:4]
                sequence_num = int.from_bytes(sequence_num[0:4], byteorder='big')
                if sequence_num in monitor:
                    # finding RTT
                    sampleRTT = current_time() - monitor[sequence_num]
                    if timeout_params.get('estRTT', -1) == -1:
                        timeout_params['devRTT'] = 0
                        timeout_params['estRTT'] = sampleRTT
                    else:
                        timeout_params['estRTT'] = 0.125 * sampleRTT + \
                            0.875 * timeout_params['estRTT']
                        timeout_params['devRTT'] = 0.75 * timeout_params['devRTT'] + \
                            abs(timeout_params['estRTT'] - sampleRTT)
                    timeout_params['interval'] = timeout_params['estRTT'] + \
                        4 * timeout_params['devRTT']
                monitor.pop(sequence_num, -1)
                # rate_queue.pop(sequence_num, -1)
                # if rate_queue.get(sequence_num, -1) != -1:
                #     print("got ack for", sequence_num, "qlen", len(rate_queue),
                #           "RTT", current_time() - rate_queue[sequence_num])
                # else:
                #     print("got ack for", sequence_num)
        except socket.timeout:
            print('will try after {} seconds'.format(timeout_interval))
            timeout_interval *= 1.5
        except ConnectionRefusedError:
            print('It seems the server is not online, sleeping for {} seconds'.
                  format(timeout_interval))
            time.sleep(timeout_interval)
            timeout_interval *= 1.5
        finally:
            if timeout_interval > 10:
                print('no response since 10 seconds. exiting')
                monitor['disconnected'] = True
                exit()
    sok.close()

def show(pkg, filename):
    if os.path.exists(pkg) and os.path.isfile(pkg):
        fp = open(pkg, "r")
        fp.seek(3)
        count = readUInt32(fp)
        hashOff = readUInt32(fp)
        blockOff = readUInt32(fp)
        hashs = []
        blocks = []
        fp.seek(hashOff)
        for i in range(count):
            hashs.append(readUInt32(fp))
        fp.seek(blockOff)
        for i in range(count):
            off = readUInt32(fp)
            length = readUInt32(fp)
            blocks.append((off, length))
        print("blocks:")
        print(blocks)
        hashValue = xxhash.xxh32(filename, seed=0).intdigest()
        blk = blocks[hashs.index(hashValue)]
        fp.seek(blk[0])
        detail = fp.read(blk[1])
        print("file detail:%s" % detail.encode("hex"))
        flag = struct.unpack("=B", detail[0:1])[0]
        size = struct.unpack("=I", detail[1:5])[0]
        content = struct.unpack("={}s".format(blk[1] - 5), detail[5:])[0]
        decode = zlib.decompress(content)
        print(len(decode))
        return decode

def XQqwlHlXKK(self, e, i):
    r = []
    for o in range(16):
        r.append(92 ^ e[o])
    n = xxhash.xxh32(b'', seed=0)
    s = xxhash.xxh32(b'', seed=0)
    n.update(bytes(r))
    for o in range(16):
        r[o] ^= 106
    s.update(bytes(r))
    s.update(i)
    a = s.hexdigest()  # is b8a7c677?
    n.update(bytes(self.pmAWhahfKx(a)))
    c = n.hexdigest()  # is 3f97d2f6?
    d = self.pmAWhahfKx(c)
    return bytes(d)

def replace_values_metadata(df0):
    df = df0.copy()
    if "source_sex" in df.columns:
        df["source_sex"] = df["source_sex"].replace(
            ["Woman", "Female", "FEmale"], "F")
        df["source_sex"] = df["source_sex"].replace(["Male"], "M")
        df["source_sex"] = df["source_sex"].replace(
            ["Unknown", "unknwon", "U"], "?")
    if "adm2" in df.columns:
        df['adm2'] = df['adm2'].str.title()
        df["adm2"] = df["adm2"].replace(["Unknown Source", "Unknown"], "")
        # df["adm2"] = df["adm2"].replace({"Greater_London": "Greater London"})  # "Hertfordshire" != "Herefordshire"
        df["adm2"] = df["adm2"].replace(
            ["Greater_London"], "Greater London")  # "Hertfordshire" != "Herefordshire"
        df['adm2'] = df['adm2'].map(
            lambda x: x if str(x) == "Norfolk"
            else "code" + str(xxhash.xxh32(str(x)).hexdigest()[:3])
        )  ## no ADM2 leaves the servers
        # df['adm2'].fillna(df.country, inplace=True)
    if "is_icu_patient" in df.columns:
        df["is_icu_patient"] = df["is_icu_patient"].str.replace("Unknown", "?")
    # df["uk_lineage"] = df["uk_lineage"].replace(np.nan, "x", regex=True)
    return df

def get_msbt_hashes(lang: str = 'USen') -> {}:
    """
    Gets the MSBT hash table for the given language, or US English by default

    :param lang: The game language to use, defaults to USen.
    :type lang: str, optional
    :returns: A dictionary of MSBT files and their vanilla hashes.
    :rtype: dict of str: str
    """
    if not hasattr(get_msbt_hashes, 'texthashes'):
        get_msbt_hashes.texthashes = {}
    if lang not in get_msbt_hashes.texthashes:
        hash_table = util.get_exec_dir() / 'data' / 'msyt' / \
            f'Msg_{lang}_hashes.csv'
        if hash_table.exists():
            get_msbt_hashes.texthashes[lang] = {}
            with hash_table.open('r') as h_file:
                csv_loop = csv.reader(h_file)
                for row in csv_loop:
                    get_msbt_hashes.texthashes[lang][row[0]] = row[1]
        elif util.get_game_file(f'Pack/Bootup_{lang}.pack').exists():
            get_msbt_hashes.texthashes[lang] = {}
            with util.get_game_file(f'Pack/Bootup_{lang}.pack').open('rb') as b_file:
                bootup_pack = sarc.read_file_and_make_sarc(b_file)
            msg_bytes = util.decompress(
                bootup_pack.get_file_data(
                    f'Message/Msg_{lang}.product.ssarc').tobytes())
            msg_pack = sarc.SARC(msg_bytes)
            for msbt in msg_pack.list_files():
                get_msbt_hashes.texthashes[lang][msbt] = xxhash.xxh32(
                    msg_pack.get_file_data(msbt)).hexdigest()
    return get_msbt_hashes.texthashes[lang]

def encode_lz4_old_kafka(buff):
    """Encode buff for 0.8/0.9 brokers -- requires an incorrect header checksum.

    Reference impl: https://github.com/dpkp/kafka-python/blob/a00f9ead161e8b05ac953b460950e42fa0e0b7d6/kafka/codec.py#L227
    """
    assert xxhash is not None
    data = encode_lz4(buff)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        buff = data[header_size+8:]
    else:
        buff = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        buff
    ])

def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    assert xxhash is not None
    data = lz4_encode(payload)
    header_size = 7
    flg = data[4]
    if not isinstance(flg, int):
        flg = ord(flg)

    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        # Old kafka does not accept the content-size field
        # so we need to discard it and reset the header flag
        flg -= 8
        data = bytearray(data)
        data[4] = flg
        data = bytes(data)
        payload = data[header_size+8:]
    else:
        payload = data[header_size:]

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        payload
    ])

def threaded_compare_texts(msyt: Path, tmp_dir: Path, lang: str = 'USen') -> (str, dict):
    """Diffs texts in an MSYT in a way suitable for multiprocessing"""
    rel_path = str(msyt.relative_to(tmp_dir)).replace('\\', '/')
    if lang in ['USen', 'EUen']:
        lang = 'XXen'
    try:
        with (tmp_dir / rel_path).open('r', encoding='utf-8') as mod_file:
            contents = mod_file.read()
        xhash = xxhash.xxh32(contents.encode('utf8')).hexdigest()
        if xhash == get_msyt_hashes()[lang][rel_path]:
            return rel_path, None
        import yaml.reader
        try:
            mod_text = yaml.safe_load(contents)
        except yaml.reader.ReaderError:
            err = ValueError(f'A character in {rel_path} could not be read')
            err.error_text = (
                f'A character in {rel_path} could not be read. This probably means that the MSBT '
                'file is damaged or corrupt. You may need to report this to the mod\'s creator.'
            )
            raise err
    except FileNotFoundError:
        return rel_path, None
    text_edits = {'entries': {}}
    hashes = get_entry_hashes()[lang]
    for entry, text in mod_text['entries'].items():
        if entry not in hashes or hashes[entry] != get_entry_hash(text['contents']):
            text_edits['entries'][entry] = copy.deepcopy(text)
    return rel_path, text_edits

def minhash_faster_but_less_random(string_set):
    hashers = [xxhash.xxh32(w.encode('utf8')) for w in string_set]
    hashes = np.asarray([h.intdigest() for h in hashers])
    while True:
        hashes *= 2654435761
        hashes %= 2 ** 32
        yield np.min(hashes)

def organize_dir(initial_path, db):
    # list of the contents of the directory
    listdir = [initial_path + "/" + fd for fd in os.listdir(initial_path)]

    # recurse into all subfolders
    print("+ Recursivity...")
    for i in [d for d in listdir if os.path.isdir(d)]:
        organize_dir(i, db)

    # get media
    print("+ Get media")
    media = [{'path': f, 'date': get_date(f), 'size': os.stat(f).st_size}
             for f in listdir
             if os.path.isfile(f) and f.__contains__('.')
             and f.split('.')[-1].lower() in IMAGE_EXTENSION]

    # hash and insert in db
    print("+ Insert into db")
    import xxhash
    for m in media:
        print(m['path'])
        # calculate hash
        with open(m['path'], 'rb') as afile:
            digest = str(xxhash.xxh32(afile.read()).hexdigest())
        if db.execute("select * from image where hash = ?", [digest]).fetchall().__len__() > 0:
            print("Already in db")
        db.execute("insert into image (path, year, month, day, hash, size) values (?,?,?,?,?,?) ;",
                   [m['path'], m['date'].year, m['date'].month, m['date'].day, digest, m['size']])
        print(str(m["date"]) + " " + digest)
    db.commit()

def __init__(self, i, p, channels, display):
    seed = getrandbits(32)
    self.fn = xxhash.xxh32(seed=seed)
    # self.total = 0  # i think this line can be removed
    self.i = i
    self.threshold = math.pow(2, i)
    # create vectors of a, b, c values for each channel
    # sum{j*x_j}
    self.a = np.zeros(channels, dtype=int)
    # sum{x_j}
    self.b = np.zeros(channels, dtype=int)
    # sum{x_j*r^j mod p}
    self.c = np.zeros(channels, dtype=int)
    self.p = p
    # TODO: make sure this randomness works safely, and doesn't, say, give
    # the same output each time you make a new RIS object
    self.r = randint(1, p - 1)
    # keep track of whether channels have been checked. If not, sampling will fail
    self.queryable = [False for j in range(channels)]
    # additionally keep track of linear combos of channels you may have checked
    # and whether they were queryable. If not, sampling will fail
    # this could get too big if the user is able to run check_linear_combo
    # for an arbitrary number of linear combinations of channels. That
    # might need to be fixed.
    self.linear_queryable = {}
    self.display = display

def extract_attribute(self, base_object: BDBasicBlock) -> Optional[Dict]:
    # Check if value already exists
    BasicBlockCallees_value = base_object.get_attribute_value('BasicBlockCallees')

    if BasicBlockCallees_value:
        pass
    else:
        names_hash = xxhash.xxh32()
        bb_start = base_object.underlying_obj.start
        bb_end = base_object.underlying_obj.end
        for call_site in base_object.underlying_obj.function.call_sites:
            if call_site.address in range(bb_start, bb_end):
                bv: BinaryView = base_object.underlying_obj.view
                for callee in bv.get_callees(call_site.address):
                    callee_name: str = bv.get_function_at(callee).name
                    if callee_name and not callee_name.startswith('sub_'):
                        names_hash.update(callee_name)
        BasicBlockCallees_value = {
            'callee_names_hash': names_hash.intdigest()
        }
        base_object.add_attribute_value('BasicBlockCallees', BasicBlockCallees_value)
        if names_hash.intdigest() == 0:
            log.log_debug(
                f'BasicBlockCallees: No names to extract, names_hash is 0')
    return BasicBlockCallees_value if BasicBlockCallees_value else None

def _perturb(self, data, seed):
    """
    Used internally to perturb data using local hashing.

    Will hash the user's data item and then perturb it with probabilities that
    satisfy epsilon local differential privacy. Local hashing is explained in more
    detail here: https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf

    Args:
        data: User's data to be privatised
        seed: The seed for the user's hash function

    Returns:
        perturbed data
    """
    index = self.index_mapper(data)

    # Taken directly from Wang (https://github.com/vvv214/LDP_Protocols/blob/master/olh.py#L55-L65)
    x = (xxhash.xxh32(str(index), seed=seed).intdigest() % self.g)
    y = x

    p_sample = np.random.random_sample()
    # the following two are equivalent
    # if p_sample > p:
    #     while not y == x:
    #         y = np.random.randint(0, g)
    if p_sample > self.p - self.q:
        # perturb
        y = np.random.randint(0, self.g)
    return y

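For context, a hedged sketch of the matching aggregation step (the function and parameter names are assumptions, not this class's real API): a report (y, seed) supports every domain value whose hash under that seed equals y, which is the same check the perturb loop in the earlier snippet performs.

import xxhash

def supports(report_y, seed, value, g):
    # True if the reported (hashed, perturbed) symbol matches this domain value
    return xxhash.xxh32(str(value), seed=seed).intdigest() % g == report_y
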
def content_id_text(text, partial=False):
    # 1. Normalize (drop whitespace)
    text = text_normalize(text, keep_ws=False)

    # 2. Create 13 character n-grams
    ngrams = ("\u0020".join(l) for l in sliding_window(text, WINDOW_SIZE_CID_T))

    # 3. Create 32-bit features with xxHash32
    features = (xxhash.xxh32(s.encode("utf-8")).intdigest() for s in ngrams)

    # 4. Apply minimum_hash
    minhash = minimum_hash(features, n=64)

    # 5. Collect least significant bits of first 64 minhash signatures
    lsb = "".join([str(x & 1) for x in minhash])

    # 6. Create 64-bit digests
    digest = int(lsb, 2).to_bytes(8, "big", signed=False)

    # 7. Prepend component header
    if partial:
        content_id_text_digest = HEAD_CID_T_PCF + digest
    else:
        content_id_text_digest = HEAD_CID_T + digest

    # 8. Encode and return
    return encode(content_id_text_digest)

def uid_shard(self, uid):
    try:
        uid_hash = xxhash.xxh32(str(uid), seed=101).intdigest()
    except:
        return -1
    shard_idx = uid_hash % len(self.stub_list)
    return shard_idx

def find_or_load_beats(filename: str, loader: bm.beats.loader.BeatLoader) -> bm.Beats:
    h = xxhash.xxh32()
    with open(filename, "rb") as file:
        block = file.read(512)
        while block:
            h.update(block)
            block = file.read(512)
    d = h.digest()

    if d in song_cache:
        logger.info(f"Cache: hit {d}")
        try:
            return pickle.load(open(song_cache[d], "rb"))
        except Exception as e:
            logger.exception(f"Cache: failed to load {d}, falling through to miss", e)

    logger.info(f"Cache: miss {d}, creating...")
    beats = bm.Beats.from_song(filename, beat_loader=loader)
    beat_filename = f"{filename}_beats.pkl"
    with open(beat_filename, "wb") as fp:
        logger.info(f"Cache: creating entry {d} at {beat_filename}")
        pickle.dump(beats, fp)
    song_cache[d] = beat_filename
    return beats

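The chunked-read pattern above generalises into a small helper. This is a sketch (name and chunk size are assumptions), shown only to isolate the incremental-hashing idiom; it is not part of the cache code.

import xxhash

def hash_file_xxh32(path, chunk_size=512):
    # Hash a file incrementally without reading it fully into memory.
    h = xxhash.xxh32()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            h.update(block)
    return h.digest()
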
def extract_attribute(self, base_object: BDFunction) -> Optional[Dict]:
    # Check if value already exists
    FunctionStringReferences_value = base_object.get_attribute_value(
        'FunctionStringReferences')

    if FunctionStringReferences_value:
        pass
    else:
        strings_hash = xxhash.xxh32()
        current_function: Function = base_object.underlying_obj
        for addr in range(current_function.lowest_address,
                          current_function.highest_address):
            const_refs = current_function.get_constants_referenced_by(addr)
            for ref in const_refs:
                string = current_function.view.get_string_at(ref.value)
                if string:
                    strings_hash.update(string.value.encode('utf8'))
        FunctionStringReferences_value = {
            'strings_hash': strings_hash.intdigest()
        }
        base_object.add_attribute_value('FunctionStringReferences',
                                        FunctionStringReferences_value)
    return FunctionStringReferences_value if FunctionStringReferences_value else None

def generate_hashes(frequency_idx, time_idx):
    """ Compute hashes from the peak indices. """
    # Yield a hash for each peak paired with later peaks,
    # subject to a constraint on the time delta.
    time_frequency = []
    for f_idx, t_idx in zip(frequency_idx, time_idx):
        time_frequency.append([t_idx, f_idx])
    # Sort by time, then by frequency
    time_frequency.sort()

    peak_len = len(time_frequency)
    for i in range(peak_len):
        for j in range(1, DETECT_PARAMETER.FAN_VALUE):
            if (i + j) < peak_len:
                freq1 = time_frequency[i][Frequency_idx]
                freq2 = time_frequency[i + j][Frequency_idx]
                t1 = time_frequency[i][Time_idx]
                t2 = time_frequency[i + j][Time_idx]
                time_delta = t2 - t1
                if time_delta <= DETECT_PARAMETER.MAX_HASH_TIME_DELTA:
                    h = xxhash.xxh32("%s|%s|%s" % (str(freq1), str(freq2), str(time_delta)))
                    yield (h.hexdigest(), t1)

def hash_function(shingle, function_id):
    try:
        return xxhash.xxh32(shingle.encode("utf8") * function_id).intdigest()
    except Exception, e:
        print e
        print shingle
        sys.exit(-1)

def _hash(self, item):
    # get Python hash ID of object
    # technique used by Rafa Carrascosa
    # https://github.com/rafacarrascosa/countminsketch
    h = xxhash.xxh32(str(hash(item)))
    for i in range(self.num_rows):
        h.update(str(i))
        yield h.intdigest() % self.num_columns

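A hypothetical usage sketch (the counts table, sizes, and helper name are assumptions) showing how row indices produced this way would update a count-min sketch:

import xxhash

num_rows, num_columns = 4, 1024
counts = [[0] * num_columns for _ in range(num_rows)]

def _indices(item):
    # Same chained-update technique as _hash above: one column index per row.
    h = xxhash.xxh32(str(hash(item)))
    for i in range(num_rows):
        h.update(str(i))
        yield h.intdigest() % num_columns

for row, col in enumerate(_indices('example-key')):
    counts[row][col] += 1
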
def test_XXH32_reset(self):
    x = xxhash.xxh32()
    h = x.intdigest()

    for i in range(10, 50):
        x.update(os.urandom(i))
    x.reset()

    self.assertEqual(h, x.intdigest())

def _hash_with_seed(funcname, seed):
    seed = xxhash.xxh32(seed).intdigest()
    xxh32 = xxhash.xxh32
    spooky32 = spooky.hash32
    if funcname == 'xxhash32':
        return lambda x: xxh32(x, seed=seed).intdigest()
    elif funcname == 'spooky32':
        return lambda x: spooky32(x, seed=seed)
    else:
        raise ValueError('Unknown function name: %s' % funcname)

def _parse_header(self):
    # IMPORTANT: for simplicity, lz4 configuration is not fully supported
    buf = self.src_file.read(7)
    if len(buf) != 7 or int.from_bytes(buf[0:4], 'little') != MAGIC_NUMBER:
        raise BadFileError
    if buf[4] != int('01100100', 2):  # FLG
        raise BadFileError
    if buf[5] != int('01110000', 2):  # BD
        raise BadFileError
    checksum = xxhash.xxh32(buf[4:6], seed=0).digest()[2]
    if checksum != buf[6]:
        raise BadFileError

def test_XXH32(self):
    x = xxhash.xxh32()
    x.update('a')
    self.assertEqual(xxhash.xxh32('a').digest(), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh32('ab').digest(), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh32('abc').digest(), x.digest())

    seed = random.randint(0, 2**32)
    x = xxhash.xxh32(seed=seed)
    x.update('a')
    self.assertEqual(xxhash.xxh32('a', seed).digest(), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh32('ab', seed).digest(), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh32('abc', seed).digest(), x.digest())

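A standalone illustration (a sketch, not part of the test suite) of the streaming property this test relies on: update() over chunks produces the same digest as hashing the concatenated input in one shot.

import xxhash

h = xxhash.xxh32()
for chunk in (b'ab', b'c'):
    h.update(chunk)
assert h.digest() == xxhash.xxh32(b'abc').digest()
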
def _frame_header(self):
    header = bytearray()
    header += MAGIC_NUMBER.to_bytes(4, 'little')
    # default frame descriptor FLG, Version Number 01
    # Block Independence 1, Block Checksum 0
    # Content Size 0, Content Checksum 1
    FD_FLG = int('01100100', 2)
    # frame descriptor BD
    # Block Max Size 7 -> 4M
    FD_BD = int('01110000', 2)
    # frame descriptor header checksum
    checksum = xxhash.xxh32(bytes([FD_FLG, FD_BD]), seed=0).digest()
    FD_HC = checksum[2]
    header.append(FD_FLG)
    header.append(FD_BD)
    header.append(FD_HC)
    return header

def thread_affinity(url, total_worker_count):
    '''
    Ensure only one client ever works on each netloc.
    This maintains better consistency of user-agents
    '''
    # Only limit netlocs if we actually need to.
    if not getModuleForUrl(url).single_thread_fetch(url):
        return True

    netloc = urllib.parse.urlsplit(url).netloc
    m = xxhash.xxh32()
    m.update(netloc.encode("utf-8"))
    nlhash = m.intdigest()
    thread_aff = nlhash % total_worker_count
    # print("Thread affinity:", self.total_worker_count, self.worker_num, thread_aff, self.worker_num == thread_aff)
    return thread_aff

def lz4_encode(payload):
    data = lz4f.compressFrame(payload)  # pylint: disable-msg=no-member
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(data[4], int):
        flg = data[4]
    else:
        flg = ord(data[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        data[header_size:]
    ])

def lz4_decode_old_kafka(payload):
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(payload[4], int):
        flg = payload[4]
    else:
        flg = ord(payload[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_payload = b''.join([
        payload[0:header_size-1],
        hc,
        payload[header_size:]
    ])
    return lz4_decode(munged_payload)

def lz4_encode_old_kafka(payload):
    """Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
    data = lz4_encode(payload)
    header_size = 7
    if isinstance(data[4], int):
        flg = data[4]
    else:
        flg = ord(data[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This is the incorrect hc
    hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    return b''.join([
        data[0:header_size-1],
        hc,
        data[header_size:]
    ])

def _extract_frame(self):
    self._parse_header()
    xxh = xxhash.xxh32(seed=0)
    while True:
        buf = self.src_file.read(4)
        block_len = int.from_bytes(buf, 'little')
        if block_len == 0:  # end mark
            break
        buf = self.src_file.read(block_len)
        if len(buf) != block_len:
            raise BadFileError
        restored_block = bytearray()
        lz4_decompress_sequences(buf, restored_block)
        self.dst_file.write(restored_block)
        # only pinned buffer, not appropriate here
        xxh.update(bytes(restored_block))
    buf = self.src_file.read(4)
    # xxh.digest would give a big-endian result
    if int.from_bytes(buf, 'little') != xxh.intdigest():
        raise BadFileError

def lz4_decode(payload):
    # Kafka's LZ4 code has a bug in its header checksum implementation
    header_size = 7
    if isinstance(payload[4], int):
        flg = payload[4]
    else:
        flg = ord(payload[4])
    content_size_bit = ((flg >> 3) & 1)
    if content_size_bit:
        header_size += 8

    # This should be the correct hc
    hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1]  # pylint: disable-msg=no-member

    munged_payload = b''.join([
        payload[0:header_size-1],
        hc,
        payload[header_size:]
    ])

    cCtx = lz4f.createCompContext()  # pylint: disable-msg=no-member
    data = lz4f.decompressFrame(munged_payload, cCtx)  # pylint: disable-msg=no-member
    return data['decomp']

def hashdirectory(self, directory, map):
    hashfunc = xxhash.xxh32()
    for file in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, file)):
            # print os.path.join(directory, file)
            key = self.hashdirectory(os.path.join(directory, file), map)
            if key in map:
                map[key] = map[key] + "?" + os.path.join(directory, file)
            else:
                map[key] = os.path.join(directory, file)
            hashfunc.update(key)
        if os.path.isfile(os.path.join(directory, file)):
            hf = xxhash.xxh64()
            f = open(os.path.join(directory, file), 'rb').read()
            byts = bytes(f)
            # mem = memoryview(byts)
            buffersize = 1048576
            bytesize = sys.getsizeof(byts)
            self.ldb.pgb.step(bytesize / 1024)
            if bytesize - buffersize > 0:
                for i in range(0, bytesize - buffersize, buffersize):
                    if bytesize - i > buffersize:
                        hf.update(byts[i:(i + buffersize)])
                    else:
                        hf.update(byts[i:])
            else:
                hf.update(byts[0:])
            key = hf.digest()
            if key in map:
                map[key] = map[key] + "?" + os.path.join(directory, file)
            else:
                map[key] = os.path.join(directory, file)
            hashfunc.update(key)
    key = hashfunc.digest()
    return key

    chars = string.letters + string.digits
    return ''.join([choice(chars) for i in range(length)])

def brokers_for_key(servers, key32bit):
    # (2**0) | 2**1 | 2**2 | 2**3 | 2**4 | 2**5 | 2**6 | 2**7 | 2**8 | 2**9 | 2**10
    k1 = key32bit & 2047  # lower 10 bits
    # (2**11) | 2**12 | 2**13 | 2**14 | 2**15 | 2**16 | 2**17 | 2**18 | 2**19 | 2**20 | 2**21
    k2 = key32bit & 4192256
    # (2**22) | 2**23 | 2**24 | 2**25 | 2**26 | 2**27 | 2**28 | 2**29 | 2**30 | 2**31 | 2**32
    k3 = key32bit & 8585740288
    # print "Keys: ", (k1, k2, k3)
    server_size = len(servers)
    return (servers[k1 % server_size],
            servers[k2 % server_size],
            servers[k3 % server_size])

def any_brokers_equals(s1, s2, s3):
    return s1 == s2 or s1 == s3 or s2 == s3

# start with 100 servers
cluster_servers = ips(10)
# print "servers: ", cluster_servers

for i in range(0, 1000):
    producer_key = key()
    rpc_chain_path = xxhash.xxh32(producer_key).intdigest()
    server_selection = brokers_for_key(cluster_servers, rpc_chain_path)
    (s1, s2, s3) = server_selection
    if any_brokers_equals(s1, s2, s3):
        print "Found a collision: ", (s1, s2, s3), ", rpc_chain: ", rpc_chain_path, ", producer key: ", producer_key

def test_xxh32(self):
    self.assertEqual(xxhash.xxh32('a').intdigest(), 1426945110)
    self.assertEqual(xxhash.xxh32('a', 0).intdigest(), 1426945110)
    self.assertEqual(xxhash.xxh32('a', 1).intdigest(), 4111757423)

def generate_location_hash_by_seed(authticket, lat, lng, acc=5):
    first_hash = xxhash.xxh32(authticket, seed=HASH_SEED).intdigest()
    location_bytes = d2h(lat) + d2h(lng) + d2h(acc)
    loc_hash = xxhash.xxh32(location_bytes, seed=first_hash).intdigest()
    return ctypes.c_int32(loc_hash).value

def generateLocation2(lat, lng, alt):
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    if not alt:
        alt = "\x00\x00\x00\x00\x00\x00\x00\x00"
    # Hash of location using static seed 0x1B845238
    return xxhash.xxh32(locationBytes, seed=0x1B845238).intdigest()

def generateLocation1(authticket, lat, lng, alt):
    firstHash = xxhash.xxh32(authticket, seed=0x1B845238).intdigest()
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    if not alt:
        alt = "\x00\x00\x00\x00\x00\x00\x00\x00"
    return xxhash.xxh32(locationBytes, seed=firstHash).intdigest()

def generate_location_hash(lat, lng, acc=5):
    location_bytes = d2h(lat) + d2h(lng) + d2h(acc)
    loc_hash = xxhash.xxh32(location_bytes, seed=HASH_SEED).intdigest()
    return ctypes.c_int32(loc_hash).value

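A small side check (a sketch, input value chosen arbitrarily) of why ctypes.c_int32 appears here: intdigest() is an unsigned 32-bit value, and c_int32(...).value reinterprets it as a signed 32-bit integer without changing its bit pattern.

import ctypes
import xxhash

u = xxhash.xxh32(b'example').intdigest()   # unsigned, 0 <= u < 2**32
s = ctypes.c_int32(u).value                # signed two's-complement view
assert (s & 0xFFFFFFFF) == u
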
def minhash(string_set):
    hashers = [xxhash.xxh32(w.encode('utf8')) for w in string_set]
    while True:
        yield min(h.intdigest() for h in hashers)
        for h in hashers:
            h.update('.')

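A hedged usage sketch (variable names and the input set are assumptions, and it presumes the minhash generator above plus xxhash are already in scope): draw the first k MinHash signature values from the generator.

k = 4
gen = minhash({'apple', 'banana', 'cherry'})
signature = [next(gen) for _ in range(k)]
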
def generateLocation2(lat, lng, alt):
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    return xxhash.xxh32(locationBytes, seed=static_seed).intdigest()

def generateLocation1(authticket, lat, lng, alt):
    firstHash = xxhash.xxh32(authticket, seed=static_seed).intdigest()
    locationBytes = d2h(lat) + d2h(lng) + d2h(alt)
    return xxhash.xxh32(locationBytes, seed=firstHash).intdigest()