def test05(self):
    """Test error from flex & bison book."""
    filepath = os.path.join(TokenStreamTest.path, 'flex0.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        s = tok.data.decode('unicode_escape')
        self.assertEqual('Antenna', s[:7])

def test04(self):
    """Test token seek and tell."""
    filepath = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(98, tok.data)

        # Memorize position after doing a first next_token(), this works
        xpos = tk.tell()

        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(73, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(5, tok.data)

        tk.seek(xpos)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(73, tok.data)

def test_literal(self):
    """Test literal strings."""
    filepath = os.path.join(TokenStreamTest.path, 'literal.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(17, len(b))
        # tok.data is a bytes-like object, so compare against bytes
        self.assertEqual(b'This', b[1:5])

def test_read_string(self):
    token_stream = TokenStream(InputStream('"ab"'))
    result = token_stream._read_string()
    self.assertEqual(Token('str', 'ab'), result)

    token_stream = TokenStream(InputStream('"ab\\c"'))
    result = token_stream._read_string()
    self.assertEqual(Token('str', 'abc'), result)

    token_stream = TokenStream(InputStream('"abc'))
    with self.assertRaises(Exception):
        token_stream._read_string()

def test_literal02(self):
    """Test escape sequences in literal strings."""
    filepath = 't/literal02.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # This is a string
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(2, len(b))
        self.assertEqual(40, b[0])
        self.assertEqual(41, b[1])

def main():
    code = "sum = lambda(x, y) x + y; print(sum(2, 3));"
    code = """
fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
time( λ() println(fib(25)) );
"""
    code = """
sum = lambda(n, ret) if n == 0 then ret else sum(n - 1, ret + n);
time(lambda() println(sum(50000, 0)));
"""
    code = """
println("foo");
halt();
println("bar");
"""
    global_env = Environment()
    for name, func in primitive.items():
        global_env.define(name, func)
    with open(sys.argv[1]) as file:
        code = file.read()
    parser = Parser(TokenStream(InputStream(code)))
    execute(evaluate,
            (parser(), global_env,
             lambda result: print(f"*** Result: {result}")))

def main():
    with open(sys.argv[1]) as file:
        code = file.read()
    # code = 'let foo(x = 1, y = 1) foo(x + y)'
    # code = 'lambda foo(x) x'
    parser = Parser(TokenStream(InputStream(code)))
    js_code = to_js(parser())
    print(js_code)

def main():
    with open(sys.argv[1]) as file:
        code = file.read()
    parser = Parser(TokenStream(InputStream(code)))
    cps_code = to_cps(parser(), lambda ast: CallAst(
        VarAst('β_TOPLEVEL'),
        [ast],
    ))
    print(cps_code)

def main():
    global_env = Environment()
    for name, func in primitive.items():
        global_env.define(name, func)
    lambda_file_path = sys.argv[1]
    with open(lambda_file_path) as file:
        code = file.read()
    parser = Parser(TokenStream(InputStream(code)))
    evaluate(parser(), global_env)

def main():
    with open(sys.argv[1]) as file:
        code = file.read()
    parser = Parser(TokenStream(InputStream(code)))
    ast = parser()
    ast = to_cps(ast, lambda ast: CallAst(VarAst('β_TOPLEVEL'), [ast]))
    # print(ast)
    ast = Optimizer().optimize(ast)
    # print(ast)
    js_code = to_js(ast)
    print(js_code)

def test_read_number(self):
    token_stream = TokenStream(InputStream('123='))
    result = token_stream._read_number()
    self.assertEqual(Token('num', 123.0), result)

    token_stream = TokenStream(InputStream('123.3.='))
    result = token_stream._read_number()
    self.assertEqual(Token('num', 123.3), result)

def test_literal03(self):
    """Test escape sequences in literal strings."""
    filepath = 't/literal03.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # This is a string
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        for i in b:
            print(f'i="{i}"')
        self.assertEqual(9, len(b))
        self.assertEqual(13, b[0])    # \r CR
        self.assertEqual(10, b[1])    # \n LF
        print(f'b[2]="{b[2]}"')
        self.assertEqual(8, b[2])     # \b BS
        self.assertEqual(9, b[3])     # \t TAB
        self.assertEqual(12, b[4])    # \f FF
        self.assertEqual(40, b[5])    # (
        self.assertEqual(41, b[6])    # )
        self.assertEqual(0x5c, b[7])  # backslash
        self.assertEqual(83, b[8])    # S

def test_to_cps(self):
    js_raw_ast = JsAst("aa")
    cps_ast = _cps_js_raw(js_raw_ast, lambda x: x)
    self.assertEqual(cps_ast, js_raw_ast)

    atom_ast = LiteralAst(1.0)
    cps_ast = to_cps(atom_ast, lambda x: x)
    self.assertEqual(atom_ast, cps_ast)

    let_ast = LetAst([], LiteralAst(False))
    cps_ast = to_cps(let_ast, lambda x: x)
    self.assertEqual(cps_ast, LiteralAst(False))

    prog_ast = ProgAst([])
    cps_ast = to_cps(prog_ast, lambda x: x)
    self.assertEqual(cps_ast, LiteralAst(False))

    prog_ast = ProgAst([LiteralAst(1)])
    cps_ast = to_cps(prog_ast, lambda x: x)
    self.assertEqual(cps_ast, LiteralAst(1))

    prog_ast = ProgAst([LiteralAst(1), LiteralAst(2)])
    cps_ast = to_cps(prog_ast, lambda x: x)
    self.assertEqual(cps_ast, ProgAst([LiteralAst(1), LiteralAst(2)]))

    if_ast = IfAst(LiteralAst(1), LiteralAst(2), LiteralAst(3))
    cps_ast: CallAst = to_cps(if_ast, lambda x: x)
    expected_ast = CallAst(
        LambdaAst(
            '', cps_ast.func.params,
            IfAst(LiteralAst(1),
                  CallAst(VarAst(cps_ast.func.params[0]), [LiteralAst(2)]),
                  CallAst(VarAst(cps_ast.func.params[0]), [LiteralAst(3)]))),
        [
            LambdaAst('', cps_ast.args[0].params,
                      VarAst(cps_ast.args[0].params[0]))
        ])
    self.assertEqual(cps_ast, expected_ast)

    lambda_ast = LambdaAst('', ['x', 'y'], LiteralAst(1))
    cps_ast = to_cps(lambda_ast, lambda x: x)
    expected_ast = LambdaAst(
        '', [cps_ast.params[0]] + ['x', 'y'],
        CallAst(VarAst(cps_ast.params[0]), [LiteralAst(1)]))
    self.assertEqual(cps_ast, expected_ast)

    binary_ast = BinaryAst('+', LiteralAst(1), LiteralAst(2))
    cps_ast = to_cps(binary_ast, lambda x: x)
    self.assertEqual(cps_ast, binary_ast)

    parse = Parser(TokenStream(InputStream("a = foo(10);")))
    cps_ast = to_cps(parse(), lambda x: x)
    expected_ast = CallAst(VarAst('foo'), [
        LambdaAst(
            '', [cps_ast.args[0].params[0]],
            AssignAst(VarAst('a'), VarAst(cps_ast.args[0].params[0]))),
        LiteralAst(10)
    ])
    self.assertEqual(cps_ast, expected_ast)

def parse_tokens(filepath):
    # Array for token storage
    tokens = []

    # Parse a character stream into a token stream
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        # tk.cc = tk.bf.next_byte()
        indent = 0
        while True:
            t = tk.next_token()
            if t.type == EToken.EOF:
                break
            if t.type in [EToken.ARRAY_END, EToken.DICT_END,
                          EToken.OBJECT_END]:
                indent -= 1
            t.print_indented(indent)
            if t.type in [EToken.ARRAY_BEGIN, EToken.DICT_BEGIN,
                          EToken.OBJECT_BEGIN]:
                indent += 1
            tokens.append(t)

    # Return the collected tokens to the caller
    return tokens

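# A minimal usage sketch for parse_tokens(), assuming it is run as a script on
# a file path given on the command line; it only counts the tokens that
# parse_tokens() returns, so nothing here depends on internals beyond what the
# function above already uses.
if __name__ == '__main__':
    import sys

    toks = parse_tokens(sys.argv[1])
    print(f'{len(toks)} tokens read')
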
def main():
    # code = "sum = lambda(x, y) x + y; print(sum(2, 3));"
    code = """
fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
time( λ() println(fib(12)) );
"""
    # code = "print(1 + 2 * 3)"
    # code = """
    # fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
    # println(fib(8));
    # """
    parser = Parser(TokenStream(InputStream(code)))
    global_env = Environment()
    for name, func in primitive.items():
        global_env.define(name, func)
    evaluate(parser(), global_env, lambda result: result)

def test_is_whitespace(self):
    for whitespace in TokenStream.WHITESPACE:
        self.assertTrue(TokenStream.is_whitespace(whitespace))
    self.assertFalse(TokenStream.is_whitespace('a'))

def test05(self):
    """Test token seek and tell."""
    filepath = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # Memorize position at the beginning, this is buggy
        xpos = tk.tell()
        # tk.seek(0)
        # print(f'test05: seek(0): cc="{chr(tk.cc)}", bf.s_pos={tk.bf.s_pos}')
        # tk.seek(1)
        # print(f'test05: seek(1), cc="{chr(tk.cc)}", bf.s_pos={tk.bf.s_pos}')

        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(98, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(73, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(5, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(19, tok.data)
        pos2 = tk.tell()

        # Go back
        tk.seek(xpos)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(98, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(73, tok.data)

        # Move forward
        tk.seek(pos2)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(18, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(33, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(45, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(66, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(13, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(2, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.OBJ_REF, tok.type)

def test_read_while(self):
    token_stream = TokenStream(InputStream('ab123='))
    result = token_stream._read_while(lambda ch: ch.isalnum())
    self.assertEqual(result, 'ab123')

def test02(self):
    """Test simple next_token() calls."""
    filepath = r't\token.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # [[[
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)

        # <<>> >>
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)

        # ]
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_END, tok.type)

        # ///
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'', tok.data)

        for i in range(6):
            tok = tk.next_token()

        # >>\r\n<<
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.CRLF, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_BEGIN, tok.type)

        # /a
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'a', tok.data)
        # /b
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'b', tok.data)
        # /c
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'c', tok.data)
        # /d
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'd', tok.data)

        tok = tk.next_token()
        self.assertEqual(EToken.DICT_END, tok.type)

def __init__(self, filepath, f):
    self.tk = TokenStream(filepath, f)
    self.f = f
    self.tok = self.tk.next_token()

class ObjectStream:
    """Parse a stream of tokens into PdfObjects."""

    # Initializer
    def __init__(self, filepath, f):
        self.tk = TokenStream(filepath, f)
        self.f = f
        self.tok = self.tk.next_token()
        # The xref table will be a property of the object stream ?

    def seek(self, offset):
        self.tk.seek(offset)
        # Normal init
        self.tok = self.tk.next_token()

    #---------------------------------------------------------------------------
    # get_indirect_obj_def
    #---------------------------------------------------------------------------

    def get_indirect_obj_def(self):
        """Found the opening OBJECT_BEGIN token, now get the entire object."""
        # self.tok has an EToken.OBJECT_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.
        tok = self.tok

        # Get the defined (internal) object
        self.tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)

        # Get the defined (internal) object
        obj = self.next_object()
        if obj.type in [EObject.ERROR, EObject.EOF]:
            return obj

        # self.tok holds the next token, read but not yet analyzed
        tok = self.tok

        # Ignore any end-of-line marker
        if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
            tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)

        if tok.type == EToken.OBJECT_END:
            return obj

    #---------------------------------------------------------------------------
    # get_array
    #---------------------------------------------------------------------------

    def get_array(self):
        """Found the opening ARRAY_BEGIN token, now get the entire array."""
        # self.tok has an EToken.ARRAY_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.

        # Prepare an array object
        arr = []

        # FIXME shouldn't I ignore end-of-line characters ?
        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.ARRAY_END:
                # It's a python array, but the elements are PdfObjects
                return PdfObject(EObject.ARRAY, arr)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)

            # Ignore end-of-line markers
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
                continue

            self.tok = tok
            obj = self.next_object()
            # self.tok holds the next token, read but not yet analyzed
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj

            # self.tok holds the next token, read but not yet analyzed
            tok = self.tok
            arr.append(obj)

    #---------------------------------------------------------------------------
    # get_dictionary
    #---------------------------------------------------------------------------

    def get_dictionary(self):
        """Found the opening DICT_BEGIN token, now get the entire dictionary."""
        # self.tok has an EToken.DICT_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.

        # Prepare a dictionary object
        d = {}

        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.DICT_END:
                self.tok = tok
                # It's a python dictionary, but the values are PdfObjects
                return PdfObject(EObject.DICTIONARY, d)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)

            # Ignore end-of-line markers
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
            elif tok.type == EToken.NAME:
                tok2 = self.tk.next_token()
                self.tok = tok2
                obj = self.next_object()
                # FIXME: can any bytes object be decoded like this ?
                # FIXME: I've lost the keys' original bytes object
                d[tok.data.decode('unicode_escape')] = obj
                # The next token is already stored in self.tok, but it hasn't
                # been analyzed yet.
                tok = self.tok
            else:
                return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # get_stream
    #---------------------------------------------------------------------------
    # FIXME define a proper stream class, with the dictionary in it

    def get_stream(self, length):
        """Found the opening STREAM_BEGIN token, now get all the data."""
        # self.tok has an EToken.STREAM_BEGIN, parse the following tokens.
        # Return is done with the closing token (already analyzed) in self.tok.

        # FIXME I need to stop testing EOF and ERROR after every single
        # next_XXX() function call, use exceptions instead.

        # Get the token that follows 'stream' (CRLF or LF)
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        # "The keyword stream that follows the stream dictionary shall be
        # followed by an end-of-line marker consisting of either a CARRIAGE
        # RETURN and a LINE FEED or just a LINE FEED, and not by a CARRIAGE
        # RETURN alone". PDF spec, § 7.3.8.1, page 19
        if tok.type not in [EToken.LF, EToken.CRLF]:
            return PdfObject(EObject.ERROR)

        # Get the token with the stream data
        tok = self.tk.next_stream(length)
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        s = tok.data

        # "There should be an end-of-line marker after the data and before
        # endstream; this marker shall not be included in the stream length".
        # PDF spec, § 7.3.8.1, page 19
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
            return PdfObject(EObject.ERROR)

        # Get the closing STREAM_END
        tok = self.tk.next_token()
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if tok.type != EToken.STREAM_END:
            return PdfObject(EObject.ERROR)

        # Return the stream data object, with the closing _END token
        return PdfObject(EObject.STREAM, data=s)

    #---------------------------------------------------------------------------
    # deflate_stream
    #---------------------------------------------------------------------------

    def deflate_stream(self, s, columns=None, predictor=None, W=None):
        """Decode stream s, encoded with flate, with predictor and W params."""
        # s: original compressed data stream (stripped)
        # columns: integer
        # predictor: integer with values in { 1, 2, 10-15 }
        # W: python array of integers

        # First, deflate the string
        zd = zlib.decompress(s)

        if not predictor:
            # No DecodeParms, so we assume no predictor
            # False means we have not done the un-predicting, just return zd
            return False, zd
        if predictor != 12:
            print(f'Predictor value {predictor} not supported (currently only 12)')
            return False, zd

        # From https://forums.adobe.com/thread/664902: "Strip off the last 10
        # characters of the string. This is the CRC and is unnecessary to
        # extract the raw data". Not doing this, at this point.

        # Sum up the column widths. For the example above [1 2 1] would be
        # 4. This is one less than the number of bytes in each row.
        n = sum(W)  # n == 4
        width = n + 1

        # Split the string into rows by the column width: sum+1, or in our
        # example, 5.
        # Is the uncompressed stream length a multiple of this width ?
        if len(zd) % width == 0:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + f', {len(zd)}={len(zd)//width}*{width}')
        else:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + ', not a multiple')

        # zd is a bytes object
        prev = [0]*width
        nrows = len(zd)//width  # 86
        arr = []
        for r in range(nrows):  # 0..85
            bs = ''
            rowdata = [x for x in zd[r*width:(r+1)*width]]  # array of ints
            for i in range(1, width):
                rowdata[i] = (rowdata[i] + prev[i]) % 256
                bs += format(rowdata[i], '08b')  # Convert to binary string
            prev = rowdata  # Update prev for next pass

            # Split the string according to W
            # print(f'{bs} len={len(bs)}')
            begin = 0
            end = 8*W[0]
            type = int(bs[begin:end], 2)
            begin = 8*W[0]
            end = 8*(W[0] + W[1])
            fld1 = int(bs[begin:end], 2)
            begin = 8*(W[0] + W[1])
            end = begin + 8*W[2]
            fld2 = int(bs[begin:end], 2)
            arr.append((type, fld1, fld2))

        # True means we have done the un-predicting, so what we return is an
        # array of 3-uples
        return True, arr

    #---------------------------------------------------------------------------
    # get_xref_section
    #---------------------------------------------------------------------------

    def get_xref_section(self):
        """Parse a cross reference section into an object."""
        # self.tok has a EToken.XREF_SECTION, parse the following tokens.

        # "Each cross-reference section shall begin with a line containing the
        # keyword xref": this implies an end-of-line marker after 'xref'
        tok = self.tk.next_token()
        if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
            self.tok = tok  # FIXME this way, self.tok will be analyzed again
            return PdfObject(EObject.ERROR)

        # Loop over cross-reference subsections
        self.xref_sec = XrefSection()
        while True:
            # Get a special token representing the sub-section header
            tok = self.tk.get_subsection_header()
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.UNEXPECTED:
                # Couldn't parse the line as a sub-section header, this means
                # that the sub-section is over. The xref is stored as a
                # property of this ObjectStream, and it is also returned.
                # State has been rolled back, so prepare to continue
                self.tok = self.tk.next_token()
                return PdfObject(EObject.XREF_SECTION, self.xref_sec)

            # Sub-section header was successfully parsed
            first_objn, entry_cnt = tok.data

            # I'm assuming entry_cnt is not 0.
            subs = XrefSubSection(first_objn, entry_cnt)
            for i in range(entry_cnt):
                # Get a special token representing a sub-section entry
                tok = self.tk.get_subsection_entry()
                if tok.type == EToken.EOF:
                    return PdfObject(EObject.EOF)
                if tok.type == EToken.ERROR:
                    return PdfObject(EObject.ERROR)
                subs.entries.append(tok.data)

            # Finish off this sub-section
            self.xref_sec.sub_sections.append(subs)

    #---------------------------------------------------------------------------
    # get_cross_reference
    #---------------------------------------------------------------------------

    def get_cross_reference(self):
        """Parse a cross reference section into an object."""
        # The current token from the stream should be either a XREF_SECTION
        # (for a traditional cross_reference table) or an INTEGER, introducing
        # an indirect object definition, for a cross-reference stream
        # (available in PDF 1.5 and later)
        tok = self.tok
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)

        # Traditional
        if tok.type == EToken.XREF_SECTION:
            return self.get_xref_section()

        # Available in PDF 1.5 and later
        if tok.type == EToken.INTEGER:
            obj = self.next_object()
            if obj.type == EObject.IND_OBJ_DEF:
                return obj

        # Any other case is an error, because we were expecting to find a
        # cross-reference table, modern or traditional.
        return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # next_object
    #---------------------------------------------------------------------------

    def next_object(self):
        """Get the next object as a PdfObject."""
        # Invariant: tok has been read from the stream, but not yet analyzed.
        # It is stored (persisted in between calls) in self.tok. This means
        # that every time control leaves this function (through return), it
        # must read, but not analyze, the next token, and store it in self.tok.
        tok = self.tok

        # Ignore CRLF (why do I parse the tokens then ?)
        while tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
            tok = self.tok = self.tk.next_token()

        # Have we reached EOF ?
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)
        elif tok.type == EToken.VERSION_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.VERSION_MARKER, data=tok.data)

        # Now analyze tok: is it a boolean ?
        elif tok.type == EToken.TRUE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, True)
        elif tok.type == EToken.FALSE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, False)

        # Is it an integer number ?
        elif tok.type == EToken.INTEGER:
            # Attempt to find the longest match first. Object definitions and
            # references are two integers plus another token, they must be
            # parsed first, and if not found, then we'll settle for the simple
            # integer.

            # Lookahead 1 token. If we find another integer, keep looking.
            # If we find an OBJECT_BEGIN, then we have an indirect object
            # definition.
            # If we find an OBJ_REF, then we have an indirect reference.
            pos = self.tk.tell()
            tok2 = self.tk.next_token()
            if tok2.type == EToken.INTEGER:
                # Keep looking
                tok3 = self.tk.next_token()
                if tok3.type == EToken.OBJECT_BEGIN:
                    # Start creating the object with the object number (from
                    # tok) and generation number (from tok2)

                    # Get the defined (internal) object
                    self.tok = tok3
                    obj = self.get_indirect_obj_def()
                    if obj.type in [EObject.ERROR, EObject.EOF]:
                        return obj
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_DEF,
                                     data=dict(obj=obj, objn=tok.data,
                                               gen=tok2.data))
                elif tok3.type == EToken.OBJ_REF:
                    # self.tk.next_token()  # peeked tok2
                    # self.tk.next_token()  # peeked tok3
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_REF,
                                     data=dict(objn=tok.data, gen=tok2.data))

            # Ignore tok2, we re-read it anyway
            self.tk.seek(pos)
            x = tok.data
            self.tok = self.tk.next_token()
            return PdfObject(EObject.INTEGER, x)

        # Is it a real number ?
        elif tok.type == EToken.REAL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.REAL, tok.data)

        # Is it a string ?
        elif tok.type in [EToken.LITERAL_STRING, EToken.HEX_STRING]:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STRING, tok.data)  # bytearray

        # Is it a name ?
        elif tok.type == EToken.NAME:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NAME, tok.data)  # bytearray

        # Is it an array ?
        elif tok.type == EToken.ARRAY_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_array()
            # self.tok == ARRAY_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            self.tok = self.tk.next_token()
            return obj

        # Is it a dictionary ? or a (dictionary, stream) couple ?
        elif tok.type == EToken.DICT_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_dictionary()
            # self.tok == DICT_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            while True:
                self.tok = self.tk.next_token()
                if self.tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                    break
            if self.tok.type != EToken.STREAM_BEGIN:
                return obj  # return the dict

            # We have found a STREAM_BEGIN token, so 'obj' is the stream
            # dictionary

            # FIXME this may not be right. Length is given as an indirect
            # object ref, we must have parsed all the xref tables at this
            # point if we want to parse this stream.
            o = obj.data['Length']
            if o.type == EObject.INTEGER:
                ln = o.data
            elif o.type == EObject.IND_OBJ_REF:
                ln = self.deref_object(o)
            else:
                return PdfObject(EObject.ERROR)

            obj2 = self.get_stream(ln)
            # FIXME use exceptions instead
            if obj2.type in [EObject.ERROR, EObject.EOF]:
                return obj2
            self.tok = self.tk.next_token()
            return PdfObject(EObject.COUPLE, data=(obj, obj2))

        # Is it a xref section ?
        elif tok.type == EToken.XREF_SECTION:
            obj = self.get_xref_section()
            # self.tok already holds the next token
            return obj

        # Is it a trailer ?
        elif tok.type == EToken.TRAILER:
            tok = self.tk.next_token()
            # Ignore CRLF (why do I parse the tokens then ?)
            while tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
            if tok.type != EToken.DICT_BEGIN:
                # FIXME specify once and for all which token I want to see
                # when an error has been detected. The question is "how do I
                # recover from this error ?"
                self.tok = self.tk.next_token()
                return PdfObject(EObject.ERROR)
            obj = self.get_dictionary()
            self.tok = self.tk.next_token()
            return PdfObject(EObject.TRAILER, data=obj)

        elif tok.type == EToken.STARTXREF:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STARTXREF)

        elif tok.type == EToken.EOF_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.EOF_MARKER)

        # Is it a stream ? Wrong. Streams are preceded by a dictionary.
        elif tok.type == EToken.STREAM_BEGIN:
            return PdfObject(EObject.ERROR)

        # Is it null ?
        elif tok.type == EToken.NULL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NULL)

        # Nothing that was expected here
        else:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.ERROR)

    #---------------------------------------------------------------------------
    # deref_object - read an indirect object from the file
    #---------------------------------------------------------------------------

    def deref_object(self, o):
        """Find an object's definition from a reference."""
        if o.type != EObject.IND_OBJ_REF:
            print(f'Expecting an indirect object reference, got "{o.type}"'
                  + ' instead')
            return None
        if not self.xref_sec:
            return None

        # Now use objn to search the xref table for the file offset where
        # this catalog dictionary object can be found; seek the file to
        # that offset, and do another ob.next_object()

        # Catalog dictionary object is found at this offset, go there
        entry = self.xref_sec.get_object(o.data['objn'], o.data['gen'])
        if not entry:
            return None
        offset, _, _ = entry
        self.seek(offset)

        # Now read the next char, this will be the beginning of
        # "6082 0 obj^M<</Metadata 6125 0 R ..." where 6082 is the objn
        o = self.next_object()
        if o.type != EObject.IND_OBJ_DEF:
            print(f'Expecting an indirect object definition, got "{o.type}"'
                  + ' instead')
            return None

        # The indirect object definition surrounds the object we want
        return o.data['obj']

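# A minimal usage sketch for ObjectStream, assuming it is constructed the same
# way as the TokenStream tests above (a file path plus a binary file object).
# The dump_objects name is hypothetical; the loop just walks top-level objects
# with next_object() until EOF or an error, which is all the class above needs.
def dump_objects(filepath):
    with open(filepath, 'rb') as f:
        ob = ObjectStream(filepath, f)
        while True:
            obj = ob.next_object()
            if obj.type in [EObject.EOF, EObject.ERROR]:
                break
            print(obj.type)
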
def test_skip_comment(self):
    token_stream = TokenStream(InputStream('# abc\ndef'))
    token_stream._skip_comment()
    self.assertEqual(token_stream._input_stream.peek(), 'd')

def test_is_op_char(self):
    for operator in TokenStream.OPERATOR:
        self.assertTrue(TokenStream.is_operator(operator))
    self.assertFalse(TokenStream.is_operator(';'))

def test_read_next(self):
    token_stream = TokenStream(
        InputStream(' # comment\n123 abc "nba" let a=2 >= js;'))
    self.assertEqual(token_stream._read_next(), Token('num', 123.0))
    self.assertEqual(token_stream._read_next(), Token('var', 'abc'))
    self.assertEqual(token_stream._read_next(), Token('str', 'nba'))
    self.assertEqual(token_stream._read_next(), Token('kw', 'let'))
    self.assertEqual(token_stream._read_next(), Token('var', 'a=2'))
    self.assertEqual(token_stream._read_next(), Token('op', '>='))
    self.assertEqual(token_stream._read_next(), Token('kw', 'js'))
    self.assertEqual(token_stream._read_next(), Token('punc', ';'))
    self.assertEqual(token_stream._read_next(), Token('null', 'null'))

    token_stream = TokenStream(InputStream('\x08'))
    with self.assertRaises(Exception):
        token_stream._read_next()

    token_stream = TokenStream(InputStream('λ (n) 1'))
    self.assertEqual(token_stream._read_next(), Token("kw", 'λ'))

def test_literal01(self):
    """Test the set of example strings from the spec."""
    filepath = 't/literal01.dat'
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # This is a string
        tok = tk.next_token()
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(16, len(b))
        self.assertEqual(b'This', b[0:4])

        # Skip over end of lines
        while True:
            tok = tk.next_token()
            if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                break

        # Strings may contain newlines\n and such
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertTrue(b.startswith(b'Strings may'))
        self.assertTrue(b.endswith(b'such.'))

        # Skip over end of lines
        while True:
            tok = tk.next_token()
            if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                break

        # Strings may contain balanced parentheses...
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(b'(x)', b[41:44])
        self.assertTrue(b.endswith(b'% and so on).'))

        # Skip over end of lines
        while True:
            tok = tk.next_token()
            if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                break

        # The following is an empty string.
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(b'The following is an empty string.', b)

        while True:
            tok = tk.next_token()
            if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                break

        # Empty string
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(0, len(b))
        self.assertEqual(b'', b)

        # Skip over end of lines
        while True:
            tok = tk.next_token()
            if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
                break

        # It has zero (0) length.
        self.assertEqual(EToken.LITERAL_STRING, tok.type)
        b = tok.data
        self.assertEqual(23, len(b))
        self.assertEqual(b'It has zero (0) length.', b)

def test_is_punc(self):
    for punc in TokenStream.PUNCTUATION:
        self.assertTrue(TokenStream.is_punctuation(punc))
    self.assertFalse(TokenStream.is_punctuation('a'))

def test01(self):
    """Test simple next_token() and peek_token() calls."""
    filepath = os.path.join(TokenStreamTest.path, 'token_stream.dat')
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)

        # Retrieve a few tokens
        tok = tk.next_token()
        self.assertEqual(EToken.DICT_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'Contents', tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(6624, tok.data)

        # Now peek once
        tok2 = tk.peek_token()
        self.assertEqual(EToken.INTEGER, tok2.type)
        self.assertEqual(0, tok2.data)

        # Retrieve a peeked token
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(0, tok.data)

        # Peek 3 tokens ahead
        tok2 = tk.peek_token()
        self.assertEqual(EToken.OBJ_REF, tok2.type)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.NAME, tok2.type)
        self.assertEqual(b'CropBox', tok2.data)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok2.type)

        # Retrieve 2 tokens
        tok = tk.next_token()
        self.assertEqual(EToken.OBJ_REF, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'CropBox', tok.data)

        # I still have the ARRAY_BEGIN in 'peeked'
        # I'm not sure this is the right spec...

        # Peeking 5 more
        tok2 = tk.peek_token()
        self.assertEqual(EToken.INTEGER, tok2.type)
        self.assertEqual(0, tok2.data)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.INTEGER, tok2.type)
        self.assertEqual(0, tok2.data)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.REAL, tok2.type)
        self.assertEqual(595.276, tok2.data)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.REAL, tok2.type)
        self.assertEqual(841.89, tok2.data)
        tok2 = tk.peek_token()
        self.assertEqual(EToken.ARRAY_END, tok2.type)

        # Retrieve 1 plus 5 plus 1
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_BEGIN, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(0, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.INTEGER, tok.type)
        self.assertEqual(0, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.REAL, tok.type)
        self.assertEqual(595.276, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.REAL, tok.type)
        self.assertEqual(841.89, tok.data)
        tok = tk.next_token()
        self.assertEqual(EToken.ARRAY_END, tok.type)
        tok = tk.next_token()
        self.assertEqual(EToken.NAME, tok.type)
        self.assertEqual(b'MediaBox', tok.data)

def test_evaluate(self):
    ast = LiteralAst(1.0)
    environment = Environment()
    evaluate(ast, environment, lambda value: self.assertEqual(value, 1.0))
    ast = LiteralAst(True)
    environment = Environment()
    evaluate(ast, environment, self.assertTrue)
    ast = LiteralAst(False)
    environment = Environment()
    evaluate(ast, environment, self.assertFalse)
    ast = LiteralAst("aaa")
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, "aaa"))
    ast = BinaryAst('+', LiteralAst(1), LiteralAst(2))
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 3.0))
    ast = ProgAst([])
    evaluate(ast, Environment(), self.assertFalse)
    ast = ProgAst([LiteralAst(1)])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 1.0))
    ast = ProgAst([LiteralAst(1), LiteralAst(2)])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 2.0))
    ast = AssignAst(LiteralAst(1), LiteralAst("a"))
    with self.assertRaises(Exception):
        evaluate(ast, Environment(), lambda value: value)
    ast = ProgAst([AssignAst(VarAst('a'), LiteralAst("foo")), VarAst('a')])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, "foo"))
    ast = AssignAst(VarAst("a"), LiteralAst("foo"))
    with self.assertRaises(Exception):
        evaluate(ast, Environment(Environment()), lambda value: value)
    ast = CallAst(
        LambdaAst("", ["a"], VarAst("a")),
        [LiteralAst(1)],
    )
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 1.0))
    ast = CallAst(LambdaAst("", ["a"], VarAst("a")), [LiteralAst("abc")])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, "abc"))

    # (λ loop (n) if n > 0 then n + loop(n - 1) else 0) (10)
    ast = CallAst(
        LambdaAst(
            "loop", ["n"],
            IfAst(
                BinaryAst(">", VarAst("n"), LiteralAst(0)),
                BinaryAst(
                    "+", VarAst("n"),
                    CallAst(VarAst("loop"),
                            [BinaryAst('-', VarAst('n'), LiteralAst(1))])),
                LiteralAst(0))),
        [LiteralAst(10)])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 55.0))

    # let (x) x;
    ast = LetAst([VarDefAst("x", None)], VarAst("x"))
    evaluate(ast, Environment(), self.assertFalse)

    # let (x = 2, y = x + 1, z = x + y) x + y + z
    ast = LetAst([
        VarDefAst("x", LiteralAst(2)),
        VarDefAst("y", BinaryAst("+", VarAst("x"), LiteralAst(1))),
        VarDefAst("z", BinaryAst("+", VarAst("x"), VarAst("y")))
    ], BinaryAst("+", BinaryAst("+", VarAst("x"), VarAst("y")), VarAst("z")))
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 10.0))

    # The second expression will result in an error,
    # since x, y, z are bound to the let body
    #
    # let (x = 2, y = x + 1, z = x + y) x + y + z; x + y + z
    ast = ProgAst([
        LetAst([
            VarDefAst('x', LiteralAst(2)),
            VarDefAst('y', BinaryAst('+', VarAst('x'), LiteralAst(1))),
            VarDefAst('z', BinaryAst('+', VarAst('x'), VarAst('y')))
        ], BinaryAst('+', BinaryAst('+', VarAst('x'), VarAst('y')),
                     VarAst('z'))),
        BinaryAst('+', BinaryAst('+', VarAst('x'), VarAst('y')), VarAst('z'))
    ])
    with self.assertRaises(Exception):
        evaluate(ast, Environment(), lambda value: value)

    ast = IfAst(LiteralAst(""), LiteralAst(1), None)
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 1.0))
    ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(2))
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 2.0))
    ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(False))
    evaluate(ast, Environment(), self.assertFalse)
    ast = {"type": "foo", "value": 'foo'}
    with self.assertRaises(Exception):
        evaluate(ast, Environment(), lambda value: value)

    # fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
    # fib(6);
    ast = ProgAst([
        AssignAst(
            VarAst('fib'),
            LambdaAst(
                'n', ['n'],
                IfAst(
                    BinaryAst('<', VarAst('n'), LiteralAst(2)),
                    VarAst('n'),
                    BinaryAst(
                        '+',
                        CallAst(
                            VarAst('fib'),
                            [BinaryAst('-', VarAst('n'), LiteralAst(1))]),
                        CallAst(
                            VarAst('fib'),
                            [BinaryAst('-', VarAst('n'), LiteralAst(2))]))))),
        CallAst(VarAst('fib'), [LiteralAst(6)])
    ])
    evaluate(ast, Environment(),
             lambda value: self.assertEqual(value, 8.0))

    ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(False))
    evaluate(ast, Environment(), self.assertFalse)
    ast = CallAst(LiteralAst(1), [])
    with self.assertRaises(Exception):
        evaluate(ast, Environment(), self.assertFalse)

    code = """
2 + twice(3, 4)
"""
    global_env = Environment()
    for name, func in primitive.items():
        global_env.define(name, func)
    parser = Parser(TokenStream(InputStream(code)))
    evaluate(parser(), global_env, lambda result: result)

        if token.type != 'op':
            return left
        his_prec = self.PRECEDENCE[token.value]
        if his_prec > my_prec:
            self._token_stream.next()
            right = self._maybe_binary(self._parse_atom(), his_prec)
            if token.value == '=':
                binary = AssignAst(left, right)
            else:
                binary = BinaryAst(token.value, left, right)
            return self._maybe_binary(binary, my_prec)
        return left

    def __call__(self) -> ProgAst:
        return self._parse_toplevel()

    def unexpected(self):
        """
        Raise an exception with an error message and error location whenever
        an unexpected token is encountered.
        """
        self._token_stream.croak(
            f'Unexpected token: {self._token_stream.peek()}')


if __name__ == '__main__':
    with open(sys.argv[1]) as f:
        code = f.read()
    ast = Parser(TokenStream(InputStream(code)))()
    print(ast)

def test_read_ident(self):
    token_stream = TokenStream(InputStream('a=1'))
    result = token_stream._read_identifier()
    self.assertEqual(Token('var', 'a=1'), result)

    token_stream = TokenStream(InputStream('a = 1'))
    result = token_stream._read_identifier()
    self.assertEqual(Token('var', 'a'), result)

    token_stream = TokenStream(InputStream('let(a = 1'))
    result = token_stream._read_identifier()
    self.assertEqual(Token('kw', 'let'), result)

    token_stream = TokenStream(InputStream('js "aaa"'))
    result = token_stream._read_identifier()
    self.assertEqual(Token('kw', 'js'), result)