def __init__(self, index, header, records):
    self.index = index
    self.header = header
    self.term = None
    self.records = records

    header_crc_bytes = struct.pack(
        "<" + HDR_FMT_RP_PREFIX_NO_CRC + HDR_FMT_CRC, *self.header[1:])
    header_crc = crc32c.crc32c(header_crc_bytes)
    if self.header.header_crc != header_crc:
        raise CorruptBatchError(self)

    crc = crc32c.crc32c(self._crc_header_be_bytes())
    crc = crc32c.crc32c(records, crc)
    if self.header.crc != crc:
        raise CorruptBatchError(self)

def getindex(self, i):
    index = self.index()
    N = index.shape[0]
    offset = index[i, 1]
    if i < N - 1:
        next_offset = index[i + 1, 1]
        value = self.buffer[offset:next_offset]
    else:
        value = self.buffer[offset:]

    if self.format_version == 1:
        stored_check_value = int.from_bytes(value[-4:], byteorder='little')
        value = value[:-4]
        if self.check_crc:
            retrieved_check_value = crc32c.crc32c(value)
            if retrieved_check_value != stored_check_value:
                raise ValidationError(
                    f"Label {i} failed its crc32c check. "
                    f"Stored: {stored_check_value} Computed: {retrieved_check_value}"
                )

    encoding = self.compress
    if encoding:
        value = compression.decompress(value, encoding, str(index[i, 0]))

    if self.frombytesfn:
        value = self.frombytesfn(value)

    return value

def md5md5crc32c(path):
    # https://github.com/colinmarc/hdfs/blob/f2f512db170db82ad41590c4ba3b7718b13317d2/file_reader.go#L76
    import hashlib

    from crc32c import crc32c  # pylint: disable=no-name-in-module

    # dfs.bytes-per-checksum = 512, default on hadoop 2.7
    bytes_per_checksum = 512
    padded = 32
    total = 0
    md5md5 = hashlib.md5()
    with open(path, "rb") as fobj:
        while True:
            block = fobj.read(bytes_per_checksum)
            if not block:
                break
            crc_int = crc32c(block)
            # NOTE: hdfs is big-endian
            crc_bytes = crc_int.to_bytes((crc_int.bit_length() + 7) // 8, "big")
            md5 = hashlib.md5(crc_bytes).digest()
            total += len(md5)
            if padded < total:
                padded *= 2
            md5md5.update(md5)
    md5md5.update(b"\0" * (padded - total))
    return "000002000000000000000000" + md5md5.hexdigest()

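# Usage sketch (the path is hypothetical): the returned digest is meant to be
# comparable with what `hdfs dfs -checksum <file>` prints for the same file,
# assuming the cluster uses the default 512 bytes-per-checksum.
print(md5md5crc32c("/tmp/example.bin"))
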
def _test_runGraphOneDOPerDOM(self, repeats=1):
    g1 = [memory("A")]
    g2 = [
        {"oid": "B", "type": "app", "app": "dlg.apps.crc.CRCApp"},
        memory("C", producers=["B"]),
    ]
    rels = [DROPRel("B", DROPLinkType.CONSUMER, "A")]
    a_data = os.urandom(32)
    c_data = str(crc32c(a_data, 0)).encode("utf8")
    node_managers = [self._start_dm(threads=self.nm_threads) for _ in range(2)]
    ids = [0] * repeats
    for n in range(repeats):
        choice = 0
        while choice in ids:
            choice = random.randint(0, 1000)
        ids[n] = choice
        sessionId = f"s{choice}"
        self._test_runGraphInTwoNMs(
            copy.deepcopy(g1),
            copy.deepcopy(g2),
            rels,
            a_data,
            c_data,
            sessionId=sessionId,
            node_managers=node_managers,
        )

def test_run_streaming_consumer_remotely2(self):
    """
    Like above, but C is hosted by DM #2.
    """
    g1 = [
        memory("A"),
        {
            "oid": "B",
            "type": "app",
            "app": "dlg.apps.simple.CopyApp",
            "inputs": ["A"],
        },
    ]
    g2 = [
        memory("C"),
        {
            "oid": "D",
            "type": "app",
            "app": "dlg.apps.crc.CRCStreamApp",
            "streamingInputs": ["C"],
            "outputs": ["E"],
        },
        memory("E"),
    ]
    rels = [DROPRel("C", DROPLinkType.OUTPUT, "B")]
    a_data = os.urandom(32)
    e_data = str(crc32c(a_data, 0)).encode("utf8")
    self._test_runGraphInTwoNMs(g1, g2, rels, a_data, e_data, leaf_oid="E")

def _test_dynamic_write_withDropType(self, dropType):
    """
    Test an AbstractDROP and a simple AppDROP (for checksum calculation)
    without an expected drop size (for app compatibility and not
    recommended in production)
    """
    # NOTE: use_staging required for multiple writes to plasma drops
    a = dropType("oid:A", "uid:A", expectedSize=-1, use_staging=True)
    b = SumupContainerChecksum("oid:B", "uid:B")
    c = InMemoryDROP("oid:C", "uid:C")
    b.addInput(a)
    b.addOutput(c)

    test_crc = 0
    with DROPWaiterCtx(self, c):
        for _ in range(self._test_num_blocks):
            a.write(self._test_block)
            test_crc = crc32c(self._test_block, test_crc)
        a.setCompleted()

    # Read the checksum from c
    cChecksum = int(droputils.allDropContents(c))
    self.assertNotEqual(a.checksum, 0)
    self.assertEqual(a.checksum, test_crc)
    self.assertEqual(cChecksum, test_crc)

def bep42_prefix(ip, crc32_salt, first_node_bits):
    # first_node_bits determines the last 3 bits
    from crc32c import crc32c

    ip_asint = decode_uint32(encode_ip(ip))
    value = crc32c(
        bytearray(
            encode_uint32((ip_asint & 0x030f3fff) | ((crc32_salt & 0x7) << 29))))
    return (value & 0xfffff800) | ((first_node_bits << 8) & 0x00000700)

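# A usage sketch. decode_uint32 / encode_ip / encode_uint32 are not defined
# above; the versions below are assumptions -- the obvious network-byte-order
# conversions implied by BEP 42, which CRCs the IP masked with 0x030f3fff with
# the salt folded into the top three bits.
import socket
import struct

def encode_ip(ip):
    return socket.inet_aton(ip)

def decode_uint32(data):
    return struct.unpack("!I", data)[0]

def encode_uint32(value):
    return struct.pack("!I", value)

# The top 21 bits of the result seed the node ID prefix.
print(f"{bep42_prefix('124.31.75.21', 0x5a, 1):08x}")
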
def crc32c_file_checksum(filepath, ftype):
    """
    Calculates the CRC32C checksum of a file locally

    :param ftype: 'dir' or 'file'
    :param filepath: local absolute filepath
    :return: the checksum as an 8-digit hex string, or None for directories
    """
    if ftype == 'dir':
        return None
    # Use a context manager so the file handle is closed promptly
    with open(filepath, 'rb') as fobj:
        buf = fobj.read()
    return "%08x" % (crc32c.crc32c(buf) & 0xFFFFFFFF)

def get_source_hashes_CRC32C(what):
    blocksize = 128 * 256
    for i in what:
        with Path(i[0]).open("rb") as file:
            crcvalue = 0
            while True:
                buf = file.read(blocksize)
                if not buf:
                    break
                crcvalue = crc32c(buf, crcvalue) & 0xffffffff
            hashstring = f'{crcvalue:x}'
            i[6] = hashstring

def _read_png_section(self, f):
    section_length = int.from_bytes(f.read(4), byteorder='big', signed=False)
    section_type = f.read(4)
    if section_type == b'':
        raise EOF()
    section_content = f.read(section_length)
    section_crc = f.read(4)
    # Compare integers, not an int against raw bytes (PNG stores the CRC as a
    # big-endian u32); the check itself remains disabled, as in the original.
    if crc32c(section_content) != int.from_bytes(section_crc, byteorder='big'):
        # raise InvalidCRCError(
        #     f"{self._source_file_path} section {section_type.decode('latin-1')} "
        #     f"has invalid CRC -> data is probably corrupted")
        pass
    return (section_type.decode('latin-1'), section_content)

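# For reference: the PNG spec's chunk CRC is the zlib CRC-32 (polynomial
# 0xEDB88320) computed over the chunk type *and* the chunk data, stored
# big-endian -- not CRC-32C of the data alone -- which is presumably why the
# check above never matched and was disabled. A spec-conformant sketch:
import zlib

def png_chunk_crc_ok(section_type: bytes, section_content: bytes,
                     section_crc: bytes) -> bool:
    return zlib.crc32(section_type + section_content) == \
        int.from_bytes(section_crc, byteorder='big')
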
def generate_flash(app_eui: int, dev_eui: int, app_key: bytes,
                   board_id: int, board_version: int) -> bytes:
    flash_before_crc = EepromContents(
        crc=0,
        board_id=board_id,
        board_version=board_version,
        app_eui=app_eui,
        dev_eui=dev_eui,
        app_key=app_key,
    )
    # Calculate CRC over all but the CRC bytes
    binary_before_crc = struct.pack(BLOCK_FORMAT, *flash_before_crc)
    flash = flash_before_crc._replace(
        crc=crc32c.crc32c(binary_before_crc[CRC_SIZE:]),
    )
    # And pack again with the right CRC set
    return struct.pack(BLOCK_FORMAT, *flash)

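# A verification sketch for generate_flash: unpack a produced image and check
# that the stored CRC matches a recomputation over everything after the CRC
# field. BLOCK_FORMAT, CRC_SIZE and the EepromContents namedtuple are assumed
# to come from the same module as generate_flash.
def verify_flash(flash: bytes) -> bool:
    contents = EepromContents._make(struct.unpack(BLOCK_FORMAT, flash))
    return contents.crc == crc32c.crc32c(flash[CRC_SIZE:])
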
def dict2buf(self, data, compress=None, tobytesfn=None):
    """Structure: [ index length, sorted index, data ]"""
    labels = np.array([int(lbl) for lbl in data.keys()], dtype=self.dtype)
    labels.sort()
    out = np.zeros((len(labels), ), dtype=np.uint64)
    eytzinger_sort(labels, out)
    labels = out

    N = len(labels)
    N_region = N.to_bytes(4, byteorder="little", signed=False)

    compress = compression.normalize_encoding(compress)
    compress_header = nvl(compress, "none")
    header = (MAGIC_NUMBERS + bytes([FORMAT_VERSION]) +
              compress_header.zfill(4).encode("ascii") + N_region)

    if N == 0:
        return header

    index_length = 2 * N
    index = np.zeros((index_length, ), dtype=self.dtype)
    index[::2] = labels

    noop = lambda x: x
    tobytesfn = nvl(tobytesfn, self.tobytesfn, noop)

    bytes_data = {
        label: compression.compress(tobytesfn(val), method=compress)
        for label, val in data.items()
    }
    # Append a little-endian crc32c of each (possibly compressed) value
    for label in bytes_data:
        bytes_data[label] += crc32c.crc32c(bytes_data[label]).to_bytes(
            4, byteorder='little')

    data_region = b"".join((bytes_data[label] for label in labels))

    # Each odd index slot holds the byte offset of that label's value
    index[1] = HEADER_LENGTH + index_length * 8
    for i in range(1, len(labels)):
        index[i * 2 + 1] = index[(i - 1) * 2 + 1] + len(bytes_data[labels[i - 1]])

    return b"".join([header, index.tobytes(), data_region])

def calc_checksum(update: Message) -> int:
    # BOLT #7: The checksum of a `channel_update` is the CRC32C checksum as
    # specified in [RFC3720](https://tools.ietf.org/html/rfc3720#appendix-B.4)
    # of this `channel_update` without its `signature` and `timestamp` fields.
    bufio = io.BytesIO()
    update.write(bufio)
    buf = bufio.getvalue()

    # BOLT #7:
    # 1. type: 258 (`channel_update`)
    # 2. data:
    #    * [`signature`:`signature`]
    #    * [`chain_hash`:`chain_hash`]
    #    * [`short_channel_id`:`short_channel_id`]
    #    * [`u32`:`timestamp`]
    #    * [`byte`:`message_flags`]

    # Note: 2 bytes for `type` field
    return crc32c.crc32c(buf[2 + 64:2 + 64 + 32 + 8]
                         + buf[2 + 64 + 32 + 8 + 4:])

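# The same computation on a raw serialized `channel_update`, with the BOLT #7
# byte offsets spelled out; a sketch equivalent to calc_checksum above.
import crc32c

def calc_checksum_raw(msg: bytes) -> int:
    # 0..2 type (258) | 2..66 signature (skipped) | 66..98 chain_hash |
    # 98..106 short_channel_id | 106..110 timestamp (skipped) | 110.. rest
    return crc32c.crc32c(msg[66:106] + msg[110:])
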
def put(self, source, target):
    last_ex = None
    for _repeat in range(6):
        try:
            key = self.handle.blob(target, chunk_size=self.CHUNK_SIZE)
            with open(source, "rb") as blob_file:
                crc32 = crc32c.crc32c(blob_file.read())
            key.crc32c = self.crc32c_hash_b64encode(crc32)
            key.upload_from_filename(source)
            break
        except (IOError, BadStatusLine, exceptions.GCloudError,
                exceptions.BadRequest) as ex:
            sleep(_repeat * 2 + 1)
            self._reconnect(self.name)
            last_ex = ex
        except Exception as ex:
            last_ex = ex
    else:
        # All six attempts failed
        raise Exception(
            "Object {} cannot be put into the bucket {}: {}!".format(
                source, self.handle.id, str(last_ex)))

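# crc32c_hash_b64encode is not shown here. GCS expects the crc32c metadata
# field to be the big-endian 4-byte CRC32C, base64-encoded, so a plausible
# implementation (an assumption, not the original) is:
import base64
import struct

def crc32c_hash_b64encode(value):
    # e.g. 0x22620404 -> "ImIEBA=="
    return base64.b64encode(struct.pack(">I", value)).decode("ascii")
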
def _test_write_withDropType(self, dropType):
    """
    Test an AbstractDROP and a simple AppDROP (for checksum calculation)
    """
    a = dropType("oid:A", "uid:A", expectedSize=self._test_drop_sz * ONE_MB)
    b = SumupContainerChecksum("oid:B", "uid:B")
    c = InMemoryDROP("oid:C", "uid:C")
    b.addInput(a)
    b.addOutput(c)

    test_crc = 0
    with DROPWaiterCtx(self, c):
        for _ in range(self._test_num_blocks):
            a.write(self._test_block)
            test_crc = crc32c(self._test_block, test_crc)

    # Read the checksum from c
    cChecksum = int(droputils.allDropContents(c))
    self.assertNotEqual(a.checksum, 0)
    self.assertEqual(a.checksum, test_crc)
    self.assertEqual(cChecksum, test_crc)

def test_run_streaming_consumer_remotely(self):
    """
    A test that checks that a streaming consumer works correctly across
    node managers when its input is in a different node, like this:

    DM #1                 DM #2
    ==================    ==============
    | A --> B --> C -|----|--> D --> E |
    ==================    ==============

    Here B is a normal application and D is a streaming consumer of C.
    We use A and E to compare that all data flows correctly.
    """
    g1 = [
        memory("A"),
        {
            "oid": "B",
            "type": "app",
            "app": "dlg.apps.simple.CopyApp",
            "inputs": ["A"],
            "outputs": ["C"],
        },
        memory("C"),
    ]
    g2 = [
        {
            "oid": "D",
            "type": "app",
            "app": "dlg.apps.crc.CRCStreamApp",
            "outputs": ["E"],
        },
        memory("E"),
    ]
    rels = [DROPRel("C", DROPLinkType.STREAMING_INPUT, "D")]
    a_data = os.urandom(32)
    e_data = str(crc32c(a_data, 0)).encode("utf8")
    self._test_runGraphInTwoNMs(g1, g2, rels, a_data, e_data, leaf_oid="E")

def _test_socket_listener(self, **kwargs):
    """
    A simple test to check that SocketListenerApps are indeed working as
    expected; that is, they write the data they receive into their output,
    and finish when the connection is closed from the client side

    The data flow diagram looks like this:

    A --> B --> C --> D
    """
    host = "127.0.0.1"
    port = 9933
    data = os.urandom(1025)

    a = SocketListenerApp("oid:A", "uid:A", host=host, port=port, **kwargs)
    b = InMemoryDROP("oid:B", "uid:B")
    c = SumupContainerChecksum("oid:C", "uid:C")
    d = InMemoryDROP("oid:D", "uid:D")
    a.addOutput(b)
    b.addConsumer(c)
    c.addOutput(d)

    # Create the socket, write, and close the connection, allowing
    # A to move to COMPLETED
    with DROPWaiterCtx(self, d, 3):  # That's plenty of time
        a.async_execute()
        utils.write_to(host, port, data, 1)

    for drop in [a, b, c, d]:
        self.assertEqual(DROPStates.COMPLETED, drop.status)

    # Our expectations are fulfilled!
    bContents = droputils.allDropContents(b)
    dContents = int(droputils.allDropContents(d))
    self.assertEqual(data, bContents)
    self.assertEqual(crc32c(data, 0), dContents)

def masked_crc32c(data):
    x = u32(crc32c(data))
    return u32(((x >> 15) | u32(x << 17)) + 0xa282ead8)

def _masked_crc32c(data):
    x = _u32(crc32c(data))
    return _u32(((x >> 15) | _u32(x << 17)) + 0xa282ead8)

def init(cls, request, metadata, media, bucket, is_destination, context):
    if context is None:
        instruction = request.headers.get("x-goog-testbench-instructions")
        if instruction == "inject-upload-data-error":
            media = utils.common.corrupt_media(media)
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    metadata.bucket = bucket.name
    metadata.generation = random.getrandbits(63)
    metadata.metageneration = 1
    metadata.id = "%s/o/%s#%d" % (
        metadata.bucket,
        metadata.name,
        metadata.generation,
    )
    metadata.size = len(media)
    actual_md5Hash = base64.b64encode(
        hashlib.md5(media).digest()).decode("utf-8")
    if metadata.md5_hash != "" and actual_md5Hash != metadata.md5_hash:
        utils.error.mismatch("md5Hash", metadata.md5_hash, actual_md5Hash,
                             context)
    actual_crc32c = crc32c.crc32c(media)
    if metadata.HasField("crc32c") and actual_crc32c != metadata.crc32c.value:
        utils.error.mismatch("crc32c", metadata.crc32c.value, actual_crc32c,
                             context)
    metadata.md5_hash = actual_md5Hash
    metadata.crc32c.value = actual_crc32c
    metadata.time_created.FromDatetime(timestamp)
    metadata.updated.FromDatetime(timestamp)
    metadata.owner.entity = utils.acl.get_object_entity("OWNER", context)
    metadata.owner.entity_id = hashlib.md5(
        metadata.owner.entity.encode("utf-8")).hexdigest()
    algorithm, key_b64, key_sha256_b64 = utils.csek.extract(
        request, False, context)
    if algorithm != "":
        utils.csek.check(algorithm, key_b64, key_sha256_b64, context)
        metadata.customer_encryption.encryption_algorithm = algorithm
        metadata.customer_encryption.key_sha256 = key_sha256_b64
    default_projection = CommonEnums.Projection.NO_ACL
    is_uniform = bucket.iam_configuration.uniform_bucket_level_access.enabled
    bucket.iam_configuration.uniform_bucket_level_access.enabled = False
    if len(metadata.acl) != 0:
        default_projection = CommonEnums.Projection.FULL
    else:
        predefined_acl = utils.acl.extract_predefined_acl(
            request, is_destination, context)
        if (predefined_acl == CommonEnums.PredefinedObjectAcl.
                PREDEFINED_OBJECT_ACL_UNSPECIFIED):
            predefined_acl = (
                CommonEnums.PredefinedObjectAcl.OBJECT_ACL_PROJECT_PRIVATE)
        elif predefined_acl == "":
            predefined_acl = "projectPrivate"
        elif is_uniform:
            utils.error.invalid(
                "Predefined ACL with uniform bucket level access enabled",
                context)
        cls.__insert_predefined_acl(metadata, bucket, predefined_acl, context)
    bucket.iam_configuration.uniform_bucket_level_access.enabled = is_uniform
    return (
        cls(metadata, media, bucket),
        utils.common.extract_projection(request, default_projection, context),
    )

#!/usr/bin/env python2
import crc32c
import struct

STAGE1_SECTORS = 28

# Read and write in binary mode so the script also behaves on Windows
with open("build/stage1/stage1.bin", "rb") as f:
    data = f.read()

# Pad to the sector boundary, leaving 4 bytes for the trailing CRC32C
padded_size = STAGE1_SECTORS * 512 - 4
assert len(data) <= padded_size
data += chr(0) * (padded_size - len(data))
data += struct.pack("<I", crc32c.crc32c(data))

with open("build/stage1.bin", "wb") as f:
    f.write(data)

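# A hypothetical check of the result (Python 3 sketch): the CRC32C of
# everything but the trailing four bytes must equal the little-endian u32
# stored at the end of the image.
import struct
import crc32c

with open("build/stage1.bin", "rb") as f:
    image = f.read()
payload, (stored,) = image[:-4], struct.unpack("<I", image[-4:])
assert crc32c.crc32c(payload) == stored
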
def get_modulo_value(experiment, user_id):
    # type: (str, Union[str, int]) -> int
    return crc32c(str(user_id).encode(), crc32c(experiment.encode())) % 100

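# Usage sketch: deterministically bucket a user into one of 100 slots, seeding
# the CRC with the experiment name so different experiments get independent
# assignments. The experiment name and split below are illustrative.
bucket = get_modulo_value("checkout-redesign", 12345)
in_treatment = bucket < 50  # e.g. a 50/50 split
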
def masked_crc32c(data):
    # mask function defined in:
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/lib/hash/crc32c.h#L40
    kMaskDelta = 0xa282ead8
    x = u32(crc32c(data))
    return u32(((x >> 15) | u32(x << 17)) + kMaskDelta)

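# A sketch of where the masked CRC is used: each TFRecord record is framed as
#   uint64 length | uint32 masked_crc(length) | data | uint32 masked_crc(data)
# with little-endian integers. u32 is assumed to be the truncate-to-32-bits
# helper used above; this is an illustration, not the original module's code.
import struct
from crc32c import crc32c

def u32(x):
    return x & 0xFFFFFFFF

def tfrecord_frame(data: bytes) -> bytes:
    length = struct.pack("<Q", len(data))
    return (length +
            struct.pack("<I", masked_crc32c(length)) +
            data +
            struct.pack("<I", masked_crc32c(data)))
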
def _masked_crc32c(data):
    x = _u32(crc32c(data))
    return _u32(((x >> 15) | _u32(x << 17)) + 0xA282EAD8)

def wfp_for_contents(file: str, contents: bytes):
    file_md5 = hashlib.md5(contents).hexdigest()
    # Print file line
    wfp = 'file={0},{1},{2}\n'.format(file_md5, len(contents), file)
    # We don't process snippets for binaries.
    if skip_snippets(contents.decode('utf-8', 'ignore'), file):
        return wfp
    # Initialize variables
    gram = ""
    window = []
    normalized = 0
    line = 1
    min_hash = MAX_CRC32
    last_hash = MAX_CRC32
    last_line = 0
    output = ""
    # Otherwise recurse src_content and calculate Winnowing hashes
    for byte in contents:
        if byte == ASCII_LF:
            line += 1
            normalized = 0
        else:
            normalized = normalize(byte)
        # Is it a useful byte?
        if normalized:
            # Add byte to gram
            gram += chr(normalized)
            # Do we have a full gram?
            if len(gram) >= GRAM:
                gram_crc32 = crc32c(gram.encode('ascii'))
                window.append(gram_crc32)
                # Do we have a full window?
                if len(window) >= WINDOW:
                    # Select minimum hash for the current window
                    min_hash = min(window)
                    # Is the minimum hash a new one?
                    if min_hash != last_hash:
                        # Hashing the hash will result in a better balanced
                        # resulting data set as it will counter the winnowing
                        # effect which selects the "minimum" hash in each window
                        crc = crc32c(min_hash.to_bytes(4, byteorder='little'))
                        crc_hex = '{:08x}'.format(crc)
                        if last_line != line:
                            if output:
                                wfp += output + '\n'
                            output = "%d=%s" % (line, crc_hex)
                        else:
                            output += ',' + crc_hex
                        last_line = line
                        last_hash = min_hash
                    # Shift window
                    window.pop(0)
                # Shift gram
                gram = gram[1:]
    if output:
        wfp += output + '\n'
    return wfp

def wfp_for_file(file: str, path: str) -> str:
    """
    Returns the WFP for a file by executing the winnowing algorithm over
    its contents.

    Parameters
    ----------
    file: str
        The name of the file
    path: str
        The full path of the file on disk
    """
    with open(path, 'rb') as f:
        contents = f.read()
    file_md5 = hashlib.md5(contents).hexdigest()
    # Print file line
    wfp = 'file={0},{1},{2}\n'.format(file_md5, len(contents), file)
    # We don't process snippets for binaries.
    if is_binary(path) or skip_snippets(contents.decode(), file):
        return wfp
    # Initialize variables
    gram = ""
    window = []
    normalized = 0
    line = 1
    min_hash = MAX_CRC32
    last_hash = MAX_CRC32
    last_line = 0
    output = ""
    # Otherwise recurse src_content and calculate Winnowing hashes
    for byte in contents:
        if byte == ASCII_LF:
            line += 1
            normalized = 0
        else:
            normalized = normalize(byte)
        # Is it a useful byte?
        if normalized:
            # Add byte to gram
            gram += chr(normalized)
            # Do we have a full gram?
            if len(gram) >= GRAM:
                gram_crc32 = crc32c(gram.encode('ascii'))
                window.append(gram_crc32)
                # Do we have a full window?
                if len(window) >= WINDOW:
                    # Select minimum hash for the current window
                    min_hash = min(window)
                    # Is the minimum hash a new one?
                    if min_hash != last_hash:
                        # Hashing the hash will result in a better balanced
                        # resulting data set as it will counter the winnowing
                        # effect which selects the "minimum" hash in each window
                        crc = crc32c(min_hash.to_bytes(4, byteorder='little'))
                        crc_hex = '{:08x}'.format(crc)
                        if last_line != line:
                            if output:
                                wfp += output + '\n'
                            output = "%d=%s" % (line, crc_hex)
                        else:
                            output += ',' + crc_hex
                        last_line = line
                        last_hash = min_hash
                    # Shift window
                    window.pop(0)
                # Shift gram
                gram = gram[1:]
    if output:
        wfp += output + '\n'
    return wfp

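# Both winnowing functions rely on module-level constants and helpers that are
# not shown. Plausible stand-ins, inferred from how they are used (treat these
# as assumptions, not the original definitions):
GRAM = 30          # gram length in bytes
WINDOW = 64        # window size in grams
ASCII_LF = 10      # ord('\n')
MAX_CRC32 = 2**32  # sentinel larger than any 32-bit hash

def normalize(byte: int) -> int:
    """Keep digits and letters, lowercasing A-Z; everything else is skipped."""
    if 48 <= byte <= 57 or 97 <= byte <= 122:  # 0-9, a-z
        return byte
    if 65 <= byte <= 90:                       # A-Z -> a-z
        return byte + 32
    return 0
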
def write(self, chunk):
    self._fileobj.write(chunk)
    self.crc32 = crc32c.crc32c(chunk, self.crc32)

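# Why the running value works: crc32c takes the previous CRC as its second
# argument, so checksumming chunk by chunk yields the same result as
# checksumming the whole buffer at once.
import crc32c

data = b"incremental checksums"
whole = crc32c.crc32c(data)
partial = crc32c.crc32c(data[7:], crc32c.crc32c(data[:7]))
assert whole == partial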