def test_hash(self): """Test simple hashing.""" result = siphash("sixteencharstrng", "i need a hash of this") self.assertEqual(10796923698683394048, result) result = siphash("0123456789ABCDEF", "a") self.assertEqual(12398370950267227270, result)
def _map(self, collect_columns):
    """
    Create tuples of records in the groups: (GroupID, CollectedColumn, Value).

    The GroupID is a hash of the grouped columns; we do this because we don't
    actually care about the column values, just that we can uniquely identify
    records with the same values. For each column we're collecting, we emit a
    record of the column and the value in the column.

    This is akin to the MAP step in a MapReduce algorithm: we're creating a
    set of values which standardize the format of the data to be processed
    and could allow the data to be processed in parallel.
    """
    if collect_columns == self._columns == {"*"}:
        # if we're doing COUNT(*), short-cut the processing
        self._group_keys["*"] = [("*", "*")]
        for _ in self._dictset:
            yield ("*", "*", "*")
        return

    for record in self._dictset:
        try:
            group_key: cython.uint64_t = siphash(
                HASH_SEED,
                "".join([str(record[column]) for column in self._columns]),
            )
        except KeyError:
            # a grouped column is missing from this record; treat it as empty
            group_key: cython.uint64_t = siphash(
                HASH_SEED,
                "".join(
                    [f"{record.get(column, '')}" for column in self._columns]
                ),
            )
        if group_key not in self._group_keys:
            self._group_keys[group_key] = [
                (column, record.get(column)) for column in self._columns
            ]
            if len(self._group_keys) >= 4999999:
                raise TooManyGroups(
                    "Groups are not selective enough and too many groups "
                    f"have been found (stopped at {len(self._group_keys)})."
                )

        for column in collect_columns:
            if column == "*":
                yield (group_key, column, "*")
            else:
                value = record.get(column)
                # ignore nulls
                if value is not None:
                    yield (group_key, column, value)
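# A minimal sketch of how the group key above behaves, assuming siphashc is
# installed. "anysixteenchars!" is an illustrative stand-in for the module's
# real HASH_SEED, and the records and columns are invented for demonstration.
from siphashc import siphash

DEMO_SEED = "anysixteenchars!"  # siphash keys must be exactly 16 characters

records = [
    {"city": "Oslo", "year": 2020, "sales": 10},
    {"city": "Oslo", "year": 2020, "sales": 15},  # same group as the first
    {"city": "Bergen", "year": 2020, "sales": 7},
]

# Rows with identical values in the grouped columns hash to the same 64-bit
# key, so the key alone identifies a group without storing the values.
for record in records:
    key = siphash(DEMO_SEED, "".join(str(record[c]) for c in ("city", "year")))
    print(key, record["sales"])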
import struct
import sys

import lz4       # requires the legacy python-lz4 0.x API (lz4.uncompress)
import siphashc


def unpack(stream):
    while True:
        data = stream.read(4)
        if len(data) != 4:
            break
        (pktSize,) = struct.unpack('<I', data)
        data = stream.read(8 + 2)
        if len(data) != 8 + 2:
            sys.stderr.write('short read')
            break
        (checksum, lenfname) = struct.unpack('<QH', data)
        fname = stream.read(lenfname)
        if len(fname) != lenfname:
            sys.stderr.write('short read')
            break
        data = stream.read(4)
        if len(data) != 4:
            sys.stderr.write('short read')
            break
        (fsize,) = struct.unpack('<I', data)
        # everything after the fixed header and filename is compressed payload
        compressedSize = pktSize - 4 - 8 - 2 - 4 - lenfname
        data = stream.read(compressedSize)
        if len(data) != compressedSize:
            sys.stderr.write('short read')
            break
        data = lz4.uncompress(data)
        got = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
        if got == checksum:
            sys.stderr.write('%s: %d -> %d\n' % (fname, compressedSize, fsize))
        else:
            sys.stderr.write('%s: checksum fail: got %d, want %d\n'
                             % (fname, got, checksum))
def load_cursor(self, cursor):
    from bitarray import bitarray

    if cursor is None:
        return
    if isinstance(cursor, str):
        cursor = orjson.loads(cursor)
    if (
        "location" not in cursor
        or "map" not in cursor
        or "partition" not in cursor
    ):
        raise InvalidCursor(f"Cursor is malformed or corrupted {cursor}")
    self.location = cursor["location"]
    find_partition = [
        blob
        for blob in self.readable_blobs
        if siphash("%" * 16, blob) == cursor["partition"]
    ]
    if len(find_partition) == 1:
        self.partition = find_partition[0]
    # the map is a bit array: one bit per readable blob, set if already read
    map_bytes = bytes.fromhex(cursor["map"])
    blob_map = bitarray()
    blob_map.frombytes(map_bytes)
    self.read_blobs = [
        self.readable_blobs[i]
        for i in range(len(self.readable_blobs))
        if blob_map[i]
    ]
def test_reference_vectors(self):
    vectors = [
        0x726fdb47dd0e0e31, 0x74f839c593dc67fd, 0x0d6c8009d9a94f5a, 0x85676696d7fb7e2d,
        0xcf2794e0277187b7, 0x18765564cd99a68d, 0xcbc9466e58fee3ce, 0xab0200f58b01d137,
        0x93f5f5799a932462, 0x9e0082df0ba9e4b0, 0x7a5dbbc594ddb9f3, 0xf4b32f46226bada7,
        0x751e8fbc860ee5fb, 0x14ea5627c0843d90, 0xf723ca908e7af2ee, 0xa129ca6149be45e5,
        0x3f2acc7f57c29bdb, 0x699ae9f52cbe4794, 0x4bc1b3f0968dd39c, 0xbb6dc91da77961bd,
        0xbed65cf21aa2ee98, 0xd0f2cbb02e3b67c7, 0x93536795e3a33e88, 0xa80c038ccd5ccec8,
        0xb8ad50c6f649af94, 0xbce192de8a85b8ea, 0x17d835b85bbb15f3, 0x2f2e6163076bcfad,
        0xde4daaaca71dc9a5, 0xa6a2506687956571, 0xad87a3535c49ef28, 0x32d892fad841c342,
        0x7127512f72f27cce, 0xa7f32346f95978e3, 0x12e0b01abb051238, 0x15e034d40fa197ae,
        0x314dffbe0815a3b4, 0x027990f029623981, 0xcadcd4e59ef40c4d, 0x9abfd8766a33735c,
        0x0e3ea96b5304a7d0, 0xad0c42d6fc585992, 0x187306c89bc215a9, 0xd4a60abcf3792b95,
        0xf935451de4f21df2, 0xa9538f0419755787, 0xdb9acddff56ca510, 0xd06c98cd5c0975eb,
        0xe612a3cb9ecba951, 0xc766e62cfcadaf96, 0xee64435a9752fe72, 0xa192d576b245165a,
        0x0a8787bf8ecb74b2, 0x81b3e73d20b49b6f, 0x7fa8220ba3b2ecea, 0x245731c13ca42499,
        0xb78dbfaf3a8d83bd, 0xea1ad565322a1a0b, 0x60e61c23a3795013, 0x6606d7e446282b93,
        0x6ca4ecb15c5f91e1, 0x9f626da15c9625f3, 0xe51b38608ef25f57, 0x958a324ceb064572,
    ]
    k = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
    m = ''
    for i in range(64):
        self.assertEqual(siphash(k, m), vectors[i])
        m += chr(i)
def next_blob(self, previous_blob=None):
    if previous_blob:
        self.read_blobs.append(previous_blob)
        self.partition = ""
        self.location = -1
    if self.partition and self.location > 0:
        if self.partition in self.readable_blobs:
            return self.partition
        partition_finder = [
            blob
            for blob in self.readable_blobs
            if siphash("%" * 16, blob) == self.partition
        ]
        if len(partition_finder) != 1:
            raise ValueError(
                f"Unable to determine current partition ({self.partition})"
            )
        return partition_finder[0]
    unread = [
        blob for blob in self.readable_blobs if blob not in self.read_blobs
    ]
    if len(unread) > 0:
        self.partition = unread[0]
        self.location = -1
        return self.partition
    return None
def get_cache_key(self, unit, pos):
    return "check:{}:{}:{}:{}".format(
        self.check_id,
        unit.pk,
        siphash("Weblate Checks", unit.all_flags.format()),
        pos,
    )
def calculate_hash(source, context):
    """Calculate checksum identifying translation."""
    if source is not None:
        data = source.encode('utf-8') + context.encode('utf-8')
    else:
        data = context.encode('utf-8')
    # Need to convert it from unsigned 64-bit int to signed 64-bit int
    return siphash('Weblate Sip Hash', data) - 2**63
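# A quick check of the arithmetic above, assuming siphashc is installed:
# siphash returns an unsigned 64-bit integer in [0, 2**64), so subtracting
# 2**63 shifts the result into the range a signed 64-bit database column
# (BIGINT) can store.
from siphashc import siphash

unsigned = siphash('Weblate Sip Hash', 'hello'.encode('utf-8'))
signed = unsigned - 2**63
assert -(2**63) <= signed < 2**63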
def test_errors(self):
    with self.assertRaises(ValueError):
        siphash('not long enough', 'a')
    with self.assertRaises(ValueError):
        siphash('toooooooooooooooooooooooo long', 'a')
    with self.assertRaises(ValueError):
        siphash('', 'a')
def add(self, position, record):
    ret_val = []
    if record.get(self.column_name):
        # index lists of items separately
        values = record[self.column_name]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            entry = (
                format(siphash(SEED, f"{value}") % MAX_INDEX, "x"),
                position,
            )
            ret_val.append(entry)
    self.temporary_index += ret_val
    return ret_val
def search(self, search_term) -> Iterable:
    """
    Search the index for a value. Returns a list of row numbers; if the
    value is not found, the list is empty.
    """
    if not isinstance(search_term, (list, set, tuple)):
        search_term = [search_term]
    result: list = []
    for term in search_term:
        key = format(siphash(SEED, f"{term}") % MAX_INDEX, "x")
        if key in self._index:  # type:ignore
            result[0:0] = self._index[key]  # type:ignore
    return result
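# A sketch of the add/search round trip above, assuming siphashc is
# installed; SEED (any 16-character siphash key) and MAX_INDEX are
# illustrative stand-ins for the module's real constants.
from siphashc import siphash

SEED = "sixteen--chars!!"  # illustrative
MAX_INDEX = 65535          # illustrative

def bucket(value) -> str:
    # the same key derivation used by both add() and search()
    return format(siphash(SEED, f"{value}") % MAX_INDEX, "x")

index: dict = {}
index.setdefault(bucket("apple"), []).append(7)  # row 7 contains "apple"
assert index.get(bucket("apple")) == [7]         # found
assert index.get(bucket("pear")) is None         # absent -> empty result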
def test_errors(self):
    """Test error handling."""
    with self.assertRaises(ValueError):
        siphash("not long enough", "a")
    with self.assertRaises(ValueError):
        siphash("toooooooooooooooooooooooo long", "a")
    with self.assertRaises(ValueError):
        siphash("", "a")
def __getitem__(self, item):
    from bitarray import bitarray

    if item == "map":
        # one bit per readable blob, set if the blob has been read
        blob_map = bitarray(
            "".join(
                [
                    "1" if blob in self.read_blobs else "0"
                    for blob in self.readable_blobs
                ]
            )
        )
        return blob_map.tobytes().hex()
    if item == "partition":
        return siphash("%" * 16, self.partition)
    if item == "location":
        return self.location
    return None
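# A sketch of the cursor "map" round trip used above and in load_cursor:
# one bit per readable blob, serialized to hex here and restored with
# bitarray.frombytes() when the cursor is loaded.
from bitarray import bitarray

blob_map = bitarray("1010")          # blobs 0 and 2 already read
hex_map = blob_map.tobytes().hex()   # "a0" -- padded to a whole byte
restored = bitarray()
restored.frombytes(bytes.fromhex(hex_map))
assert restored[:4] == bitarray("1010")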
def pack(fname):
    global total, compressed
    f = open(fname)
    data = f.read()
    f.close()
    checksum = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
    fsize = len(data)
    data = lz4.compress(data)
    # size of packet(4), checksum(8), fnamelen(2)+fname,
    # uncompressed size(4), compressed data
    l = len(data)
    pktlen = 4 + 8 + 2 + len(fname) + 4 + l
    total += fsize
    compressed += len(data)
    sys.stderr.write("%s: %d -> %d\n" % (fname, fsize, len(data)))
    sys.stdout.write(
        struct.pack('<IQH%dsI%ds' % (len(fname), l),
                    pktlen, checksum, len(fname), fname, fsize, data))
    sys.stdout.flush()
def read_blob(self, blob: str) -> IOBase:
    """
    Read-thru cache.
    """
    cache_server = memcached_server()
    # if the cache isn't configured, read and get out of here
    if not cache_server:
        result = self.get_blob_bytes(blob)
        return io.BytesIO(result)

    # hash the blob name for the look-up; memcached keys must be short and
    # contain no whitespace, which a hash of the name guarantees
    from siphashc import siphash

    blob_hash = str(siphash("RevengeOfTheBlob", blob))

    # try to fetch the cached file
    result = cache_server.get(blob_hash)

    # if the item was a miss, get it from storage and add it to the cache
    if result is None:
        result = self.get_blob_bytes(blob)
        cache_server.set(blob_hash, result)

    return io.BytesIO(result)
"ADDDAYS": add_days, "DAYSDIFF": diff_days, # STRINGS "UCASE": lambda x: str(x).upper(), "UPPER": lambda x: str(x).upper(), "LCASE": lambda x: str(x).lower(), "LOWER": lambda x: str(x).lower(), "TRIM": lambda x: str(x).strip(), "LEN": len, "STRING": to_string, "LEFT": lambda x, y: str(x)[: int(y)], "RIGHT": lambda x, y: str(x)[-int(y) :], "MID": lambda x, y, z: str(x)[int(y) :][: int(z)], "CONCAT": concat, # NUMBERS "ROUND": round, "TRUNC": parse_number(float, truncate), "INTEGER": parse_number(float, int), "DOUBLE": parse_number(float, float), # BOOLEAN "BOOLEAN": lambda x: str(x).upper() != "FALSE", "ISNONE": lambda x: x is None, # HASHING & ENCODING "HASH": lambda x: format(siphash("INCOMPREHENSIBLE", str(x)), "X"), "MD5": get_md5, "RANDOM": get_random, # return a random number 0-99 # OTHER "BETWEEN": lambda val, low, high: low < val < high, "SORT": lambda x: sorted(x), }
def test_reference_vectors(self):
    """Test reference vectors."""
    vectors = [
        0x726FDB47DD0E0E31, 0x74F839C593DC67FD, 0x0D6C8009D9A94F5A, 0x85676696D7FB7E2D,
        0xCF2794E0277187B7, 0x18765564CD99A68D, 0xCBC9466E58FEE3CE, 0xAB0200F58B01D137,
        0x93F5F5799A932462, 0x9E0082DF0BA9E4B0, 0x7A5DBBC594DDB9F3, 0xF4B32F46226BADA7,
        0x751E8FBC860EE5FB, 0x14EA5627C0843D90, 0xF723CA908E7AF2EE, 0xA129CA6149BE45E5,
        0x3F2ACC7F57C29BDB, 0x699AE9F52CBE4794, 0x4BC1B3F0968DD39C, 0xBB6DC91DA77961BD,
        0xBED65CF21AA2EE98, 0xD0F2CBB02E3B67C7, 0x93536795E3A33E88, 0xA80C038CCD5CCEC8,
        0xB8AD50C6F649AF94, 0xBCE192DE8A85B8EA, 0x17D835B85BBB15F3, 0x2F2E6163076BCFAD,
        0xDE4DAAACA71DC9A5, 0xA6A2506687956571, 0xAD87A3535C49EF28, 0x32D892FAD841C342,
        0x7127512F72F27CCE, 0xA7F32346F95978E3, 0x12E0B01ABB051238, 0x15E034D40FA197AE,
        0x314DFFBE0815A3B4, 0x027990F029623981, 0xCADCD4E59EF40C4D, 0x9ABFD8766A33735C,
        0x0E3EA96B5304A7D0, 0xAD0C42D6FC585992, 0x187306C89BC215A9, 0xD4A60ABCF3792B95,
        0xF935451DE4F21DF2, 0xA9538F0419755787, 0xDB9ACDDFF56CA510, 0xD06C98CD5C0975EB,
        0xE612A3CB9ECBA951, 0xC766E62CFCADAF96, 0xEE64435A9752FE72, 0xA192D576B245165A,
        0x0A8787BF8ECB74B2, 0x81B3E73D20B49B6F, 0x7FA8220BA3B2ECEA, 0x245731C13CA42499,
        0xB78DBFAF3A8D83BD, 0xEA1AD565322A1A0B, 0x60E61C23A3795013, 0x6606D7E446282B93,
        0x6CA4ECB15C5F91E1, 0x9F626DA15C9625F3, 0xE51B38608EF25F57, 0x958A324CEB064572,
    ]
    k = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    message = ""
    for i in range(64):
        self.assertEqual(siphash(k, message), vectors[i])
        message += chr(i)
def get_hash(string, queue_count=const.AMQ.NUM_QUEUES):
    """Generate a hash for the given string."""
    # Keep only the low 31 bits of the 64-bit hash so the value also fits
    # in a signed 32-bit (PHP) integer
    hash32 = siphashc.siphash(const.AMQ.HASH, string) & 0x7FFFFFFF
    return hash32 % queue_count
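# A standalone illustration of the masking above; KEY and NUM_QUEUES are
# hypothetical stand-ins for const.AMQ.HASH and const.AMQ.NUM_QUEUES.
import siphashc

KEY = "0123456789ABCDEF"  # hypothetical 16-character siphash key
NUM_QUEUES = 8            # hypothetical queue count

full = siphashc.siphash(KEY, "routing-key")
hash31 = full & 0x7FFFFFFF   # keep only the low 31 bits
queue = hash31 % NUM_QUEUES
assert 0 <= queue < NUM_QUEUES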
def gpg_cache_key(suffix: str) -> str:
    return "gpg:{}:{}".format(
        siphash("Weblate GPG hash", settings.WEBLATE_GPG_IDENTITY), suffix
    )
def hash_text(name):
    """Hash text for use in HTML id."""
    return hash_to_checksum(siphash("Weblate URL hash", name.encode()))
def test_hash(self):
    # expected values match the simple-hashing test above
    result = siphash('sixteencharstrng', 'i need a hash of this')
    self.assertEqual(10796923698683394048, result)
    result = siphash('0123456789ABCDEF', 'a')
    self.assertEqual(12398370950267227270, result)
def raw_hash(*parts: str):
    """Calculate checksum identifying translation."""
    data = "".join(parts)
    return siphash("Weblate Sip Hash", data)
def sip(val):
    return siphash("TheApolloMission", val)