Example 1
    def test_hash(self):
        """Test simple hashing."""
        result = siphash("sixteencharstrng", "i need a hash of this")
        self.assertEqual(10796923698683394048, result)

        result = siphash("0123456789ABCDEF", "a")
        self.assertEqual(12398370950267227270, result)
Example 2
    def _map(self, collect_columns):
        """
        Create Tuples of records in the Groups (GroupID, CollectedColumn, Value)

        The GroupID is a hash of the grouped columns, we do this because we don't actually
        care about the column values, just that we can uniquely identify records with
        the same values.

        For each column we're collecting, we emit a record of the column and the value
        in the column.

        This is akin to the MAP step in a MapReduce algo, we're creating a set of values
        which standardize the format of the data to be processed and could allow the
        data to be processed in parallel.
        """
        if collect_columns == self._columns == {"*"}:
            # if we're doing COUNT(*), short-cut the processing
            self._group_keys["*"] = [("*", "*")]
            for record in self._dictset:
                yield ("*", "*", "*")
            return

        for record in self._dictset:
            try:
                group_key: cython.uint64_t = siphash(
                    HASH_SEED,
                    "".join([str(record[column]) for column in self._columns]),
                )
            except KeyError:
                group_key: cython.uint64_t = siphash(
                    HASH_SEED,
                    "".join([
                        f"{record.get(column, '')}" for column in self._columns
                    ]),
                )
            if group_key not in self._group_keys:
                self._group_keys[group_key] = [(column, record.get(column))
                                               for column in self._columns]
                if len(self._group_keys) >= 4999999:
                    raise TooManyGroups(
                        f"Groups are not selective enough and too many Groups have been found (stopped at {len(self._group_keys)})."
                    )

            for column in collect_columns:
                if column == "*":
                    yield (group_key, column, "*")
                else:
                    v = record.get(column)  # skip null values
                    if v is not None:
                        yield (group_key, column, v)
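Note: below is a minimal sketch of how the (group_key, column, value) triples emitted by _map could be consumed by a REDUCE-style step; the reduce_counts helper and the sample triples are illustrative assumptions, not part of the original module.

from collections import defaultdict

def reduce_counts(mapped_triples):
    # count collected values per (group, column) pair -- hypothetical helper
    counts = defaultdict(int)
    for group_key, column, value in mapped_triples:
        counts[(group_key, column)] += 1
    return dict(counts)

# e.g. the triples _map yields for COUNT(*) over three records
triples = [("*", "*", "*"), ("*", "*", "*"), ("*", "*", "*")]
print(reduce_counts(triples))  # {('*', '*'): 3}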
Example 3
def unpack(stream):

    while True:
        data = stream.read(4)
        if len(data) != 4:
            break
        (pktSize, ) = struct.unpack('<I', data)
        data = stream.read(8 + 2)
        if len(data) != 8 + 2:
            sys.stderr.write('short read')
            break
        (checksum, lenfname) = struct.unpack('<QH', data)
        fname = stream.read(lenfname)
        if len(fname) != lenfname:
            sys.stderr.write('short read')
            break
        data = stream.read(4)
        if len(data) != 4:
            sys.stderr.write('short read')
            break
        (fsize, ) = struct.unpack('<I', data)
        compressedSize = pktSize - 4 - 8 - 2 - 4 - lenfname
        data = stream.read(compressedSize)
        if len(data) != compressedSize:
            sys.stderr.write('short read')
            break

        data = lz4.uncompress(data)
        got = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
        if got == checksum:
            sys.stderr.write('%s: %d -> %d\n' % (fname, compressedSize, fsize))
        else:
            sys.stderr.write('%s: checksum fail: got %d, want %d\n' %
                             (fname, got, checksum))
Example 4
    def load_cursor(self, cursor):
        from bitarray import bitarray

        if cursor is None:
            return

        if isinstance(cursor, str):
            cursor = orjson.loads(cursor)

        if (not "location" in cursor.keys() or not "map" in cursor.keys()
                or not "partition" in cursor.keys()):
            raise InvalidCursor(f"Cursor is malformed or corrupted {cursor}")

        self.location = cursor["location"]
        find_partition = [
            blob for blob in self.readable_blobs
            if siphash("%" * 16, blob) == cursor["partition"]
        ]
        if len(find_partition) == 1:
            self.partition = find_partition[0]
        map_bytes = bytes.fromhex(cursor["map"])
        blob_map = bitarray()
        blob_map.frombytes(map_bytes)
        self.read_blobs = [
            self.readable_blobs[i] for i in range(len(self.readable_blobs))
            if blob_map[i]
        ]
Example 5
def unpack(stream):

    while True:
        data = stream.read(4)
        if len(data) != 4:
            break
        (pktSize,) = struct.unpack('<I', data)
        data = stream.read(8+2)
        if len(data) != 8+2:
            sys.stderr.write('short read')
            break
        (checksum, lenfname) = struct.unpack('<QH', data)
        fname = stream.read(lenfname)
        if len(fname) != lenfname:
            sys.stderr.write('short read')
            break
        data = stream.read(4)
        if len(data) != 4:
            sys.stderr.write('short read')
            break
        (fsize,) = struct.unpack('<I', data)
        compressedSize = pktSize - 4 - 8 - 2 - 4 - lenfname
        data = stream.read(compressedSize)
        if len(data) != compressedSize:
            sys.stderr.write('short read')
            break

        data = lz4.uncompress(data)
        got = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
        if got == checksum:
            sys.stderr.write('%s: %d -> %d\n' % (fname, compressedSize, fsize))
        else:
            sys.stderr.write('%s: checksum fail: got %d, want %d\n' % (fname, got, checksum))
Example 6
 def test_reference_vectors(self):
     vectors = [
       0x726fdb47dd0e0e31, 0x74f839c593dc67fd, 0x0d6c8009d9a94f5a,
       0x85676696d7fb7e2d, 0xcf2794e0277187b7, 0x18765564cd99a68d,
       0xcbc9466e58fee3ce, 0xab0200f58b01d137, 0x93f5f5799a932462,
       0x9e0082df0ba9e4b0, 0x7a5dbbc594ddb9f3, 0xf4b32f46226bada7,
       0x751e8fbc860ee5fb, 0x14ea5627c0843d90, 0xf723ca908e7af2ee,
       0xa129ca6149be45e5, 0x3f2acc7f57c29bdb, 0x699ae9f52cbe4794,
       0x4bc1b3f0968dd39c, 0xbb6dc91da77961bd, 0xbed65cf21aa2ee98,
       0xd0f2cbb02e3b67c7, 0x93536795e3a33e88, 0xa80c038ccd5ccec8,
       0xb8ad50c6f649af94, 0xbce192de8a85b8ea, 0x17d835b85bbb15f3,
       0x2f2e6163076bcfad, 0xde4daaaca71dc9a5, 0xa6a2506687956571,
       0xad87a3535c49ef28, 0x32d892fad841c342, 0x7127512f72f27cce,
       0xa7f32346f95978e3, 0x12e0b01abb051238, 0x15e034d40fa197ae,
       0x314dffbe0815a3b4, 0x027990f029623981, 0xcadcd4e59ef40c4d,
       0x9abfd8766a33735c, 0x0e3ea96b5304a7d0, 0xad0c42d6fc585992,
       0x187306c89bc215a9, 0xd4a60abcf3792b95, 0xf935451de4f21df2,
       0xa9538f0419755787, 0xdb9acddff56ca510, 0xd06c98cd5c0975eb,
       0xe612a3cb9ecba951, 0xc766e62cfcadaf96, 0xee64435a9752fe72,
       0xa192d576b245165a, 0x0a8787bf8ecb74b2, 0x81b3e73d20b49b6f,
       0x7fa8220ba3b2ecea, 0x245731c13ca42499, 0xb78dbfaf3a8d83bd,
       0xea1ad565322a1a0b, 0x60e61c23a3795013, 0x6606d7e446282b93,
       0x6ca4ecb15c5f91e1, 0x9f626da15c9625f3, 0xe51b38608ef25f57,
       0x958a324ceb064572]
     k = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
     m = ''
     for i in range(64):
         self.assertEqual(siphash(k, m), vectors[i])
         m += chr(i)
Example 7
 def next_blob(self, previous_blob=None):
     if previous_blob:
         self.read_blobs.append(previous_blob)
         self.partition = ""
         self.location = -1
     if self.partition and self.location > 0:
         if self.partition in self.readable_blobs:
             return self.partition
         partition_finder = [
             blob for blob in self.readable_blobs
             if siphash("%" * 16, blob) == self.partition
         ]
         if len(partition_finder) != 1:
             raise ValueError(
                 f"Unable to determine current partition ({self.partition})"
             )
         return partition_finder[0]
     unread = [
         blob for blob in self.readable_blobs if blob not in self.read_blobs
     ]
     if len(unread) > 0:
         self.partition = unread[0]
         self.location = -1
         return self.partition
     return None
Example 8
 def get_cache_key(self, unit, pos):
     return "check:{}:{}:{}:{}".format(
         self.check_id,
         unit.pk,
         siphash("Weblate   Checks", unit.all_flags.format()),
         pos,
     )
Example 9
def calculate_hash(source, context):
    """Calculate checksum identifying translation."""
    if source is not None:
        data = source.encode('utf-8') + context.encode('utf-8')
    else:
        data = context.encode('utf-8')
    # Need to convert it from unsigned 64-bit int to signed 64-bit int
    return siphash('Weblate Sip Hash', data) - 2**63
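As a sanity check on the signed conversion above: subtracting 2**63 maps the unsigned 64-bit range [0, 2**64 - 1] onto the signed range [-2**63, 2**63 - 1], which fits a signed 64-bit integer column. The values below are boundary cases, not real siphash outputs.

unsigned_min, unsigned_max = 0, 2**64 - 1

assert unsigned_min - 2**63 == -(2**63)   # smallest signed 64-bit value
assert unsigned_max - 2**63 == 2**63 - 1  # largest signed 64-bit value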
Example 10
 def test_errors(self):
     with self.assertRaises(ValueError):
         siphash('not long enough', 'a')
     with self.assertRaises(ValueError):
         siphash('toooooooooooooooooooooooo long', 'a')
     with self.assertRaises(ValueError):
         siphash('', 'a')
Example 11
 def add(self, position, record):
     ret_val = []
     if record.get(self.column_name):
         # index lists of items separately
         values = record[self.column_name]
         if not isinstance(values, list):
             values = [values]
         for value in values:
             entry = (format(siphash(SEED, f"{value}") % MAX_INDEX, "x"), position)
             ret_val.append(entry)
     self.temporary_index += ret_val
     return ret_val
Example 12
 def search(self, search_term) -> Iterable:
     """
     Search the index for a value. Returns a list of row numbers; if the value is
     not found, the list is empty.
     """
     if not isinstance(search_term, (list, set, tuple)):
         search_term = [search_term]
     result: list = []
     for term in search_term:
         key = format(siphash(SEED, f"{term}") % MAX_INDEX, "x")
         if key in self._index:  # type:ignore
             result[0:0] = self._index[key]  # type:ignore
     return result
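To make the add/search pair above concrete, here is a minimal sketch of the same idea: hash each value into a fixed-size bucket key and look rows up by that key. SEED, MAX_INDEX, and the sample rows here are stand-in assumptions, not the module's real constants.

from siphashc import siphash

SEED = "eschatologically"  # any 16-character key (assumed value)
MAX_INDEX = 4294967295     # assumed bucket range

rows = {0: {"name": "alpha"}, 1: {"name": "beta"}, 2: {"name": "alpha"}}

# build: bucket key -> list of row numbers
index = {}
for position, record in rows.items():
    key = format(siphash(SEED, f"{record['name']}") % MAX_INDEX, "x")
    index.setdefault(key, []).append(position)

# search: rows whose 'name' hashes to the same bucket
key = format(siphash(SEED, "alpha") % MAX_INDEX, "x")
print(index.get(key, []))  # [0, 2]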
Example 13
 def test_errors(self):
     """Test error handling."""
     with self.assertRaises(ValueError):
         siphash("not long enough", "a")
     with self.assertRaises(ValueError):
         siphash("toooooooooooooooooooooooo long", "a")
     with self.assertRaises(ValueError):
         siphash("", "a")
Example 14
    def __getitem__(self, item):
        from bitarray import bitarray

        if item == "map":
            blob_map = bitarray("".join([
                "1" if blob in self.read_blobs else "0"
                for blob in self.readable_blobs
            ]))
            return blob_map.tobytes().hex()
        if item == "partition":
            return siphash("%" * 16, self.partition)
        if item == "location":
            return self.location
        return None
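A small round trip of the "map" field produced above (a hex string encoding a read/unread bitmask, one bit per readable blob); the blob names are made up, and the decode mirrors what load_cursor in Example 4 does with the same field.

from bitarray import bitarray

readable_blobs = ["blob-a", "blob-b", "blob-c"]  # illustrative names
read_blobs = ["blob-b"]

# encode: one bit per readable blob, 1 = already read
blob_map = bitarray("".join("1" if b in read_blobs else "0" for b in readable_blobs))
encoded = blob_map.tobytes().hex()

# decode: recover the read blobs from the hex string
decoded = bitarray()
decoded.frombytes(bytes.fromhex(encoded))
assert [b for i, b in enumerate(readable_blobs) if decoded[i]] == read_blobs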
Example 15
def pack(fname):

    global total, compressed


    f = open(fname)
    data = f.read()
    f.close()
    checksum = siphashc.siphash('\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', data)
    fsize = len(data)
    data = lz4.compress(data)

    # size of packet(4), checksum(8), fnamelen(2)+fname, uncompressed size(4), compressed data
    l = len(data)
    pktlen = 4 + 8 + 2 + len(fname) + 4 + l

    total += fsize
    compressed += len(data)

    sys.stderr.write("%s: %d -> %d\n"  %(fname, fsize, len(data)))
    sys.stdout.write( struct.pack('<IQH%dsI%ds' % (len(fname), l), pktlen, checksum, len(fname), fname,fsize,data))
    sys.stdout.flush()
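A quick worked check of the length arithmetic, using made-up sizes: the packet length written by pack includes the 4-byte length field itself, which is why unpack subtracts every fixed-width field plus the filename to recover the compressed payload size.

lenfname = 5          # e.g. a 5-character filename (hypothetical)
compressed_len = 100  # bytes of LZ4 data (hypothetical)

# pack(): pktlen = size(4) + checksum(8) + fnamelen(2) + fname + fsize(4) + payload
pktlen = 4 + 8 + 2 + lenfname + 4 + compressed_len  # 123

# unpack(): recover the payload size from pktSize
assert pktlen - 4 - 8 - 2 - 4 - lenfname == compressed_len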
Example 16
    def read_blob(self, blob: str) -> IOBase:
        """
        Read-thru cache
        """
        cache_server = memcached_server()
        # if cache isn't configured, read and get out of here
        if not cache_server:
            result = self.get_blob_bytes(blob)
            return io.BytesIO(result)

        # hash the blob name for the look up
        from siphashc import siphash

        blob_hash = str(siphash("RevengeOfTheBlob", blob))

        # try to fetch the cached file
        result = cache_server.get(blob_hash)

        # if the item was a miss, get it from storage and add it to the cache
        if result is None:
            result = self.get_blob_bytes(blob)
            cache_server.set(blob_hash, result)

        return io.BytesIO(result)
Example 17
    "ADDDAYS": add_days,
    "DAYSDIFF": diff_days,
    # STRINGS
    "UCASE": lambda x: str(x).upper(),
    "UPPER": lambda x: str(x).upper(),
    "LCASE": lambda x: str(x).lower(),
    "LOWER": lambda x: str(x).lower(),
    "TRIM": lambda x: str(x).strip(),
    "LEN": len,
    "STRING": to_string,
    "LEFT": lambda x, y: str(x)[: int(y)],
    "RIGHT": lambda x, y: str(x)[-int(y) :],
    "MID": lambda x, y, z: str(x)[int(y) :][: int(z)],
    "CONCAT": concat,
    # NUMBERS
    "ROUND": round,
    "TRUNC": parse_number(float, truncate),
    "INTEGER": parse_number(float, int),
    "DOUBLE": parse_number(float, float),
    # BOOLEAN
    "BOOLEAN": lambda x: str(x).upper() != "FALSE",
    "ISNONE": lambda x: x is None,
    # HASHING & ENCODING
    "HASH": lambda x: format(siphash("INCOMPREHENSIBLE", str(x)), "X"),
    "MD5": get_md5,
    "RANDOM": get_random,  # return a random number 0-99
    # OTHER
    "BETWEEN": lambda val, low, high: low < val < high,
    "SORT": lambda x: sorted(x),
}
Example 18
 def test_reference_vectors(self):
     """Test reference vectors."""
     vectors = [
         0x726FDB47DD0E0E31,
         0x74F839C593DC67FD,
         0x0D6C8009D9A94F5A,
         0x85676696D7FB7E2D,
         0xCF2794E0277187B7,
         0x18765564CD99A68D,
         0xCBC9466E58FEE3CE,
         0xAB0200F58B01D137,
         0x93F5F5799A932462,
         0x9E0082DF0BA9E4B0,
         0x7A5DBBC594DDB9F3,
         0xF4B32F46226BADA7,
         0x751E8FBC860EE5FB,
         0x14EA5627C0843D90,
         0xF723CA908E7AF2EE,
         0xA129CA6149BE45E5,
         0x3F2ACC7F57C29BDB,
         0x699AE9F52CBE4794,
         0x4BC1B3F0968DD39C,
         0xBB6DC91DA77961BD,
         0xBED65CF21AA2EE98,
         0xD0F2CBB02E3B67C7,
         0x93536795E3A33E88,
         0xA80C038CCD5CCEC8,
         0xB8AD50C6F649AF94,
         0xBCE192DE8A85B8EA,
         0x17D835B85BBB15F3,
         0x2F2E6163076BCFAD,
         0xDE4DAAACA71DC9A5,
         0xA6A2506687956571,
         0xAD87A3535C49EF28,
         0x32D892FAD841C342,
         0x7127512F72F27CCE,
         0xA7F32346F95978E3,
         0x12E0B01ABB051238,
         0x15E034D40FA197AE,
         0x314DFFBE0815A3B4,
         0x027990F029623981,
         0xCADCD4E59EF40C4D,
         0x9ABFD8766A33735C,
         0x0E3EA96B5304A7D0,
         0xAD0C42D6FC585992,
         0x187306C89BC215A9,
         0xD4A60ABCF3792B95,
         0xF935451DE4F21DF2,
         0xA9538F0419755787,
         0xDB9ACDDFF56CA510,
         0xD06C98CD5C0975EB,
         0xE612A3CB9ECBA951,
         0xC766E62CFCADAF96,
         0xEE64435A9752FE72,
         0xA192D576B245165A,
         0x0A8787BF8ECB74B2,
         0x81B3E73D20B49B6F,
         0x7FA8220BA3B2ECEA,
         0x245731C13CA42499,
         0xB78DBFAF3A8D83BD,
         0xEA1AD565322A1A0B,
         0x60E61C23A3795013,
         0x6606D7E446282B93,
         0x6CA4ECB15C5F91E1,
         0x9F626DA15C9625F3,
         0xE51B38608EF25F57,
         0x958A324CEB064572,
     ]
     k = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
     message = ""
     for i in range(64):
         self.assertEqual(siphash(k, message), vectors[i])
         message += chr(i)
Example 19
def get_hash(string, queue_count=const.AMQ.NUM_QUEUES):
    """Generates a hash for given string"""
    # Only use the last 31 bits of the 64-bit hash because of serious
    # PHP-retardedness
    hash32 = siphashc.siphash(const.AMQ.HASH, string) & 0x7FFFFFFF
    return hash32 % queue_count
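A small illustration of the masking above: & 0x7FFFFFFF keeps only the low 31 bits, so the result is always non-negative and fits in a signed 32-bit integer; the 64-bit hash value below is made up, not a real siphash output.

h64 = 0xDEADBEEFCAFEBABE       # illustrative 64-bit hash value

h31 = h64 & 0x7FFFFFFF         # keep only the low 31 bits
assert 0 <= h31 < 2**31        # always a positive signed 32-bit value

queue = h31 % 8                # e.g. distribute across 8 queues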
Example 20
def gpg_cache_key(suffix: str) -> str:
    return "gpg:{}:{}".format(
        siphash("Weblate GPG hash", settings.WEBLATE_GPG_IDENTITY), suffix
    )
Example 21
def hash_text(name):
    """Hash text for use in HTML id."""
    return hash_to_checksum(siphash("Weblate URL hash", name.encode()))
Example 22
    def test_hash(self):
        result = siphash('sixteencharstrng', 'i need a hash of this')
        self.assertEqual(expected_hash1, result)

        result = siphash('0123456789ABCDEF', 'a')
        self.assertEqual(expected_hash2, result)
Example 23
def raw_hash(*parts: str):
    """Calculates checksum identifying translation."""
    data = "".join(part for part in parts)
    return siphash("Weblate Sip Hash", data)
Example 24
 def sip(val):
     return siphash("TheApolloMission", val)