def _parse_sections(stream: IO[bytes]) -> Iterable[SECTION_TYPES]:
    """
    Helper function implementing the core logic for parsing sections.

    Lazily yields every section found between the current stream position
    and the end of the stream.  Custom sections (id 0x00) are yielded
    wherever they occur.  For every other section, an empty placeholder is
    yielded for each section that was skipped before it, and after the stream
    is exhausted placeholders are yielded for any sections never encountered.

    Raises a ParseError if a section id is unknown, appears more than once,
    or appears after it was already filled in as a missing section (i.e. the
    sections are out of order).
    """
    # Determine where the stream ends, then restore the original position.
    start_pos = stream.tell()
    end_pos = stream.seek(0, 2)
    stream.seek(start_pos)

    # During section parsing sections may be omitted.  The WASM spec says that
    # omitted sections are equivalent to them being present but empty.  As we
    # parse the bytecode, we need to fill in any missing sections with their
    # empty equivalent.  This iterator allows us to lazily step through the
    # sections in order.
    empty_section_iter = iter(EMPTY_SECTIONS_BY_ID)

    # A data structure to allow detection of duplicate sections.
    seen_section_ids: Set[int] = set()
    # We track missing sections separately.
    missing_section_ids: Set[int] = set()

    while stream.tell() < end_pos:
        section_id = parse_single_byte(stream)

        if section_id == numpy.uint8(0x00):
            # Custom sections may appear anywhere and any number of times.
            yield parse_custom_section(stream)
            continue
        elif section_id not in PARSERS_BY_SECTION_ID:
            raise ParseError(f"Invalid section id: {hex(section_id)}")
        elif section_id in seen_section_ids:
            raise ParseError(
                f"Encountered multiple sections with the section id: "
                f"{hex(section_id)}")
        elif section_id in missing_section_ids:
            all_seen = tuple(
                sorted(seen_section_ids.union(missing_section_ids)))
            raise ParseError(
                f"Encountered section id out of order. section_id={section_id} "
                f"already encountered sections {all_seen}")

        seen_section_ids.add(section_id)

        # Fill in every section that was skipped before this one.  The id of
        # the *skipped* section must be recorded (not the current section's
        # id), otherwise the out-of-order check above can never trigger.
        # NOTE(review): assumes the first element of each pair produced by
        # `_next_empty_section` is the id of the skipped section -- confirm
        # against that helper.
        for empty_id, empty_section in _next_empty_section(section_id, empty_section_iter):
            missing_section_ids.add(empty_id)
            yield empty_section

        section_parser_fn = PARSERS_BY_SECTION_ID[section_id]
        yield section_parser_fn(stream)

    # get empty sections for any that were omitted.
    for _, empty_section in empty_section_iter:
        yield empty_section
def parse_null_byte(stream: IO[bytes]) -> None:
    """
    Read exactly one byte from the stream and require it to be 0x00.

    Raises a ParseError when the stream has nothing left to read, or when
    the byte that was read is anything other than 0x00.
    """
    consumed = stream.read(1)

    if not consumed:
        raise ParseError("Unexpected end of stream")
    if consumed != b'\x00':
        raise ParseError(f"Expected 0x00 but got {hex(consumed[0])}")
def parse_single_byte(stream: IO[bytes]) -> UInt8:
    """
    Read one byte from the stream and return its value wrapped in UInt8.

    Raises a ParseError if the stream yields no data.
    """
    raw = stream.read(1)
    if not raw:
        raise ParseError("Unexpected end of stream")
    return UInt8(raw[0])
def parse_null_byte(stream: IO[bytes]) -> None:
    """
    Consume one byte from the stream, which must be the null byte 0x00.

    Raises a ParseError when the stream yields no data, and a
    MalformedModule when the consumed byte is not 0x00.
    """
    data = stream.read(1)

    if data == b'\x00':
        return None
    if not data:
        raise ParseError("Unexpected end of stream")
    raise MalformedModule(f"TODO: expected 0x00 but got {hex(data[0])}")
def parse_bytes(stream: IO[bytes]) -> bytes:
    """
    Parse a size-prefixed run of raw bytes.

    The size is obtained from `parse_u32`, then that many bytes are read
    from the stream and returned.  Raises a ParseError if the stream
    supplies a different number of bytes than the declared size.
    """
    expected_length = parse_u32(stream)
    payload = stream.read(expected_length)
    actual_length = len(payload)

    if actual_length == expected_length:
        return payload

    raise ParseError(
        f"Error parsing raw bytes. Expected bytestream of size {expected_length}. "
        f"Parsed stream is of size {actual_length}"
    )
def parse_single_byte(stream: IO[bytes]) -> numpy.uint8:
    """
    Read one byte from the stream and return its value as a `numpy.uint8`.

    Raises a ParseError if the stream yields no data.
    """
    raw = stream.read(1)
    if not raw:
        raise ParseError("Unexpected end of stream")
    return numpy.uint8(raw[0])
def parse_module(stream: IO[bytes]) -> Module:
    """
    Parse a binary encoded WebAssembly module from the stream.

    The magic preamble and version are read first, followed by all of the
    module's sections.  The function and code sections are paired up to
    build the module's functions.

    Returns the assembled Module.  Raises a ParseError if the function
    section and code section do not contain the same number of entries.
    """
    # `parse_magic` both parses and validates the 4-byte *magic* preamble.
    # Currently its return value is simply discarded.
    parse_magic(stream)
    version = parse_version(stream)

    # The custom sections are unpacked but not used when building the module.
    (
        _custom_sections,
        type_section,
        import_section,
        function_section,
        table_section,
        memory_section,
        global_section,
        export_section,
        start_section,
        element_segment_section,
        code_section,
        data_segment_section,
    ) = parse_sections(stream)

    # Each function type index must have exactly one matching code entry.
    num_types = len(function_section.types)
    num_codes = len(code_section.codes)
    if num_types != num_codes:
        raise ParseError(
            "Mismatched lengths of function section and code section. "
            f"function-types[{num_types}] != "
            f"codes[{num_codes}]"
        )

    type_code_pairs = zip(function_section.types, code_section.codes)
    functions = tuple(
        Function(type_idx, code.locals, code.expr)
        for type_idx, code in type_code_pairs
    )

    return Module(
        version=version,
        types=type_section.function_types,
        funcs=functions,
        tables=table_section.tables,
        mems=memory_section.mems,
        globals=global_section.globals,
        elem=element_segment_section.element_segments,
        data=data_segment_section.data_segments,
        start=start_section.start,
        imports=import_section.imports,
        exports=export_section.exports,
    )
def parse_vector(
    sub_parser: Callable[[IO[bytes]], TItem],
    stream: IO[bytes],
) -> Tuple[TItem, ...]:
    """
    Parse a vector of encoded values.

    The element count is obtained from `parse_u32`; the elements are then
    produced by `_parse_vector` using `sub_parser` and collected into a
    tuple.  Any exception raised while producing the elements is re-raised
    as a ParseError chained to the original exception.
    """
    item_count = parse_u32(stream)

    try:
        items = tuple(_parse_vector(sub_parser, item_count, stream))
    except Exception as err:
        raise ParseError(f"Error parsing vector: {err}") from err
    else:
        return items
def parse_blocktype(stream: IO[bytes]) -> Tuple[ValType, ...]:
    """
    Parse the result type of a block.

    A single byte is read from the stream.  The byte 0x40 denotes an empty
    result and produces an empty tuple; any other byte is converted with
    `ValType.from_byte` and returned as a one-element tuple.

    Raises a ParseError (chained to the underlying ValueError) if the byte
    is not accepted by `ValType.from_byte`.
    """
    byte = parse_single_byte(stream)

    # 0x40 marks a block with no result values.
    if byte == 0x40:
        return tuple()

    try:
        valtype = ValType.from_byte(byte)
    except ValueError as err:
        raise ParseError(
            f"Invalid byte while parsing blocktype. Got '{hex(byte)}': {str(err)}"
        ) from err

    return (valtype, )
def parse_and_validate_length_fn(stream: IO[bytes]) -> TReturn:
    """
    Parse one size-prefixed section and verify that it is consumed exactly.

    The declared size is obtained from `parse_size`, that many bytes are read
    from `stream` into an isolated in-memory stream, and `parser_fn` is run
    against that isolated stream.

    Raises a ParseError if the stream holds fewer bytes than declared, or if
    `parser_fn` leaves bytes of the section unconsumed.

    NOTE(review): `parser_fn` is a free variable in this block; presumably it
    is bound by an enclosing function -- confirm.
    """
    # Note: Section parsers all operate under the assumption that their `stream`
    # contains **only** the bytes for the given section.  It follows that
    # successful parsing for any section **must** consume the full stream.
    declared_size = parse_size(stream)
    raw_section = stream.read(declared_size)

    if len(raw_section) != declared_size:
        # Both fragments must be formatted so the actual sizes are reported.
        raise ParseError(
            "Section declared size larger than stream. "
            f"declared={declared_size} actual={len(raw_section)}")

    section_stream = io.BytesIO(raw_section)
    section = parser_fn(section_stream)

    # Compare where the parser stopped against the end of the section bytes.
    current_pos = section_stream.tell()
    end_pos = section_stream.seek(0, io.SEEK_END)

    if current_pos != end_pos:
        raise ParseError(
            f"Section parser did not fully consume section stream, leaving "
            f"{end_pos - current_pos} unconsumed bytes")

    return section
def parse_text(stream: IO[bytes]) -> str:
    """
    Parse a size-prefixed UTF-8 encoded string.

    The encoded length is obtained from `parse_u32`, that many bytes are read
    from the stream, and the bytes are decoded as UTF-8.

    Raises a ParseError if the stream supplies fewer bytes than declared and
    a MalformedModule (chained to the UnicodeDecodeError) if the bytes are
    not valid UTF-8.
    """
    encoded_name_length = parse_u32(stream)
    encoded_name = stream.read(encoded_name_length)

    if len(encoded_name) != encoded_name_length:
        # `!r` renders the bytes explicitly; implicit str() of a bytes object
        # triggers a BytesWarning when Python runs with `-b`.
        raise ParseError(
            "Unexpected end of stream while parsing name. Expected length "
            f"{encoded_name_length}. Got {encoded_name!r} with length "
            f"{len(encoded_name)}")

    try:
        return encoded_name.decode('utf8')
    except UnicodeDecodeError as err:
        raise MalformedModule(
            f"Invalid UTF-8 encoding in name: {err}"
        ) from err
def parse_version(stream: IO[bytes]) -> Tuple[UInt8, UInt8, UInt8, UInt8]:
    """
    Read the four version bytes from the stream and return them as a tuple.

    Raises a ParseError if the four bytes are not one of KNOWN_VERSIONS.

    https://webassembly.github.io/spec/core/bikeshed/index.html#binary-version
    """
    first = parse_single_byte(stream)
    second = parse_single_byte(stream)
    third = parse_single_byte(stream)
    fourth = parse_single_byte(stream)
    version = (first, second, third, fourth)

    if version in KNOWN_VERSIONS:
        return version

    rendered = tuple(hex(byte) for byte in version)
    raise ParseError(
        f"Unknown version. Got: "
        f"{rendered}"
    )
def parse_version(
    stream: IO[bytes],
) -> Tuple[numpy.uint8, numpy.uint8, numpy.uint8, numpy.uint8]:
    """
    Parse the version portion of a binary encoded Web Assembly module.

    Four single bytes are read from the stream in order and returned as a
    tuple.  Raises a ParseError if that tuple is not in KNOWN_VERSIONS.

    https://webassembly.github.io/spec/core/bikeshed/index.html#binary-version
    """
    version_bytes = tuple(parse_single_byte(stream) for _ in range(4))

    if version_bytes not in KNOWN_VERSIONS:
        as_hex = tuple(map(hex, version_bytes))
        raise ParseError(
            f"Unknown version. Got: "
            f"{as_hex}"
        )

    return version_bytes
def _parse_unsigned_leb128(stream: IO[bytes]) -> Iterable[int]:
    """
    Yield the shifted partial values of an unsigned LEB128 encoded integer.

    One byte is read from the stream per iteration.  For each byte the bits
    selected by LOW_MASK are shifted left by the current shift (0, 7, 14, ...)
    and yielded.  Iteration stops after the first byte that has no bits in
    common with HIGH_MASK.

    Raises a ParseError if the stream ends before the terminating byte is
    read, or if the shift grows beyond SHIFT_64_BIT_MAX (the encoded integer
    is too large).

    NOTE(review): LOW_MASK, HIGH_MASK and SHIFT_64_BIT_MAX are defined outside
    this block; presumably they are the 7-bit payload mask, the continuation
    bit and the largest shift allowed for a 64-bit value -- confirm.
    """
    for shift in itertools.count(0, 7):
        if shift > SHIFT_64_BIT_MAX:
            # Oversized input is a parse failure like any other malformed
            # encoding, so it is reported with the parser's own error type.
            raise ParseError(
                "LEB128 encoded integer is too large: encoding exceeds the "
                "maximum supported width of 64 bits"
            )

        byte = stream.read(1)
        if not byte:
            raise ParseError(
                "Unexpected end of stream while parsing LEB128 encoded integer"
            )
        value = byte[0]

        yield (value & LOW_MASK) << shift

        # A clear continuation bit marks the final byte of the encoding.
        if not value & HIGH_MASK:
            break