    def test05(self):
        """Check the first token of the flex & bison sample file.

        The first token must be a literal string whose decoded text starts
        with 'Antenna'.
        """
        sample = os.path.join(TokenStreamTest.path, 'flex0.dat')
        with open(sample, 'rb') as stream:
            first = TokenStream(sample, stream).next_token()
            self.assertEqual(EToken.LITERAL_STRING, first.type)

            # Only the leading characters of the decoded payload are checked.
            decoded = first.data.decode('unicode_escape')
            self.assertEqual('Antenna', decoded[:7])
    def test04(self):
        """Test token seek and tell.

        The position is memorized right after the first token, two more
        tokens are consumed, and the stream is then rewound to the memorized
        position: the next token must again be the one that followed it.
        """
        filepath = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
        with open(filepath, 'rb') as f:
            tk = TokenStream(filepath, f)

            tok = tk.next_token()
            self.assertEqual(EToken.INTEGER, tok.type)
            self.assertEqual(98, tok.data)

            # Memorize position after doing a first next_token(), this works
            xpos = tk.tell()

            tok = tk.next_token()
            self.assertEqual(EToken.INTEGER, tok.type)
            self.assertEqual(73, tok.data)

            tok = tk.next_token()
            self.assertEqual(EToken.INTEGER, tok.type)
            self.assertEqual(5, tok.data)

            # Go back to the memorized position; without this rewind the
            # assertion below could not see the second token (73) again.
            tk.seek(xpos)

            tok = tk.next_token()
            self.assertEqual(EToken.INTEGER, tok.type)
            self.assertEqual(73, tok.data)
    def test_literal(self):
        """Check type, length and leading text of the first literal string."""
        sample = os.path.join(TokenStreamTest.path, 'literal.dat')
        with open(sample, 'rb') as stream:
            first = TokenStream(sample, stream).next_token()
            self.assertEqual(EToken.LITERAL_STRING, first.type)

            payload = first.data
            self.assertEqual(17, len(payload))
            # NOTE(review): sibling tests treat tok.data as a bytes-like
            # object; if that holds here too, comparing a slice of it with
            # the str 'This' can never succeed -- confirm the expected type.
            self.assertEqual('This', payload[1:5])
    def test_read_string(self):
        """Check _read_string() on plain, escaped and unterminated strings."""
        # A plain double-quoted string yields its content.
        token_stream = TokenStream(InputStream('"ab"'))
        result = token_stream._read_string()
        self.assertEqual(Token('str', 'ab'), result)

        # A backslash escapes the following character.
        token_stream = TokenStream(InputStream('"ab\\c"'))
        result = token_stream._read_string()
        self.assertEqual(Token('str', 'abc'), result)

        # An unterminated string is expected to raise; the call must sit
        # inside the assertRaises block for the check to take place.
        token_stream = TokenStream(InputStream('"abc'))
        with self.assertRaises(Exception):
            token_stream._read_string()
    def test_literal02(self):
        """Check a literal string made only of escaped parentheses.

        The token data must hold exactly two items, 40 and 41 (the character
        codes of '(' and ')').
        """
        sample = 't/literal02.dat'
        with open(sample, 'rb') as stream:
            first = TokenStream(sample, stream).next_token()
            self.assertEqual(EToken.LITERAL_STRING, first.type)

            payload = first.data
            self.assertEqual(2, len(payload))
            opening, closing = payload[0], payload[1]
            self.assertEqual(40, opening)
            self.assertEqual(41, closing)
def main():
    """Run the program stored in the file named by the first CLI argument.

    A global environment is populated with every entry of `primitive`, the
    file is parsed, and the resulting AST is handed to `execute` together
    with `evaluate` and a callback that prints the final result.
    """
    # Sample programs kept for reference only: `code` is replaced by the
    # contents of the input file below. Each triple-quoted literal must be
    # closed, otherwise the following lines are parsed as Python.
    code = "sum = lambda(x, y) x + y; print(sum(2, 3));"
    code = """
    fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
    time( λ() println(fib(25)) );
    """
    code = """
    sum = lambda(n, ret)
            if n == 0 then ret
                    else sum(n - 1, ret + n);
    time(lambda() println(sum(50000, 0)));
    """
    global_env = Environment()

    for name, func in primitive.items():
        global_env.define(name, func)
    with open(sys.argv[1]) as file:
        code = file.read()
        parser = Parser(TokenStream(InputStream(code)))
        execute(evaluate, (parser(), global_env,
                           lambda result: print(f"*** Result: {result}")))
def main():
    """Parse the file named by the first CLI argument and convert it with to_js.

    The converted code is only bound to a local name here; nothing is
    printed or returned.
    """
    source_path = sys.argv[1]
    with open(source_path) as source:
        program_text = source.read()
    # Alternative inline inputs:
    # program_text = 'let foo(x = 1, y = 1) foo(x + y)'
    # program_text = 'lambda foo(x) x'
    parse = Parser(TokenStream(InputStream(program_text)))
    js_code = to_js(parse())
def main():
    """Parse the file named by the first CLI argument and CPS-convert it.

    The top-level continuation wraps the final AST in a call to the
    'β_TOPLEVEL' variable, the same continuation used by the sibling entry
    point that goes on to optimize and emit JavaScript.
    """
    with open(sys.argv[1]) as file:
        code = file.read()
    parser = Parser(TokenStream(InputStream(code)))
    # The original expression was cut off after `CallAst(`; it is completed
    # with the arguments used by the identical call elsewhere in this file.
    cps_code = to_cps(parser(), lambda ast: CallAst(
        VarAst('β_TOPLEVEL'), [ast]))
def main():
    """Evaluate the program stored in the file named by the first CLI argument.

    Every (name, function) pair of `primitive` is defined in a fresh global
    environment before the parsed program is evaluated in it.
    """
    global_env = Environment()
    for binding in primitive.items():
        # Each binding is a (name, function) pair.
        global_env.define(*binding)

    with open(sys.argv[1]) as source:
        program_text = source.read()

    parse = Parser(TokenStream(InputStream(program_text)))
    evaluate(parse(), global_env)
def main():
    """Parse, CPS-convert, optimize and convert to JS the file given on the CLI.

    The pipeline is: Parser -> to_cps (with a 'β_TOPLEVEL' continuation) ->
    Optimizer.optimize -> to_js. The result is only bound to a local name.
    """
    with open(sys.argv[1]) as source:
        program_text = source.read()
    parse = Parser(TokenStream(InputStream(program_text)))

    def toplevel(ast):
        # Final continuation: hand the result to the β_TOPLEVEL variable.
        return CallAst(VarAst('β_TOPLEVEL'), [ast])

    cps_ast = to_cps(parse(), toplevel)
    optimized_ast = Optimizer().optimize(cps_ast)
    js_code = to_js(optimized_ast)
    def test_read_number(self):
        """Check _read_number() on an integer and on a number with two dots."""
        cases = (
            ('123=', 123.0),     # reading stops at the first non-digit
            ('123.3.=', 123.3),  # reading stops at the second dot
        )
        for text, value in cases:
            stream = TokenStream(InputStream(text))
            parsed = stream._read_number()
            self.assertEqual(Token('num', value), parsed)
    def test_literal03(self):
        """Test escape sequences in literal strings.

        The first token of the sample must be a literal string whose data
        holds exactly the nine character codes checked below.
        """
        filepath = 't/literal03.dat'
        with open(filepath, 'rb') as f:
            tk = TokenStream(filepath, f)

            # This is a string
            tok = tk.next_token()
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            # A leftover body-less `for i in b:` stood here and made the
            # method a syntax error; it has been removed.
            self.assertEqual(9, len(b))
            self.assertEqual(13, b[0])  # \r CR
            self.assertEqual(10, b[1])  # \n LF
            self.assertEqual(8, b[2])   # \b BS
            self.assertEqual(9, b[3])   # \t TAB
            self.assertEqual(12, b[4])  # \f FF
            self.assertEqual(40, b[5])  # (
            self.assertEqual(41, b[6])  # )
            self.assertEqual(0x5c, b[7])  # backslash
            self.assertEqual(83, b[8])  # S
 def test_to_cps(self):
     """Check to_cps() with an identity continuation on each AST node kind."""
     # Raw JS, literals, empty let/prog and single-statement prog are
     # expected to come back unchanged or reduced to a literal.
     js_raw_ast = JsAst("aa")
     cps_ast = _cps_js_raw(js_raw_ast, lambda x: x)
     self.assertEqual(cps_ast, js_raw_ast)
     atom_ast = LiteralAst(1.0)
     cps_ast = to_cps(atom_ast, lambda x: x)
     self.assertEqual(atom_ast, cps_ast)
     let_ast = LetAst([], LiteralAst(False))
     cps_ast = to_cps(let_ast, lambda x: x)
     self.assertEqual(cps_ast, LiteralAst(False))
     prog_ast = ProgAst([])
     cps_ast = to_cps(prog_ast, lambda x: x)
     self.assertEqual(cps_ast, LiteralAst(False))
     prog_ast = ProgAst([LiteralAst(1)])
     cps_ast = to_cps(prog_ast, lambda x: x)
     self.assertEqual(cps_ast, LiteralAst(1))
     prog_ast = ProgAst([LiteralAst(1), LiteralAst(2)])
     cps_ast = to_cps(prog_ast, lambda x: x)
     self.assertEqual(cps_ast, ProgAst([LiteralAst(1), LiteralAst(2)]))
     # Conditional: generated parameter names are read back from the result
     # so that the expected tree can be compared structurally.
     if_ast = IfAst(LiteralAst(1), LiteralAst(2), LiteralAst(3))
     cps_ast: CallAst = to_cps(if_ast, lambda x: x)
     # NOTE(review): the expected_ast construction below is missing lines
     # (the brackets do not balance), so this method does not parse as
     # written -- restore it from the original source.
     expected_ast = CallAst(
             '', cps_ast.func.params,
                   CallAst(VarAst(cps_ast.func.params[0]), [LiteralAst(2)]),
                           [LiteralAst(3)]))), [
                               LambdaAst('', cps_ast.args[0].params,
     self.assertEqual(cps_ast, expected_ast)
     # Lambda: a continuation parameter is expected in front of 'x', 'y',
     # and the body becomes a call to that continuation.
     lambda_ast = LambdaAst('', ['x', 'y'], LiteralAst(1))
     cps_ast = to_cps(lambda_ast, lambda x: x)
     expected_ast = LambdaAst(
         '', [cps_ast.params[0]] + ['x', 'y'],
         CallAst(VarAst(cps_ast.params[0]), [LiteralAst(1)]))
     self.assertEqual(cps_ast, expected_ast)
     # A binary expression over literals is expected to be left as is.
     binary_ast = BinaryAst('+', LiteralAst(1), LiteralAst(2))
     cps_ast = to_cps(binary_ast, lambda x: x)
     self.assertEqual(cps_ast, binary_ast)
     parse = Parser(TokenStream(InputStream("a = foo(10);")))
     cps_ast = to_cps(parse(), lambda x: x)
     # NOTE(review): this expected_ast is also missing lines (unbalanced
     # brackets) -- restore it from the original source.
     expected_ast = CallAst(VarAst('foo'), [
             '', [cps_ast.args[0].params[0]],
             AssignAst(VarAst('a'), VarAst(cps_ast.args[0].params[0]))),
     self.assertEqual(cps_ast, expected_ast)
def parse_tokens(filepath):
    """Read every token of the file at `filepath` while tracking nesting depth.

    NOTE(review): this function is missing lines and does not parse as
    written: the EOF test has no body (presumably a `break`), both
    `if t.type in [` lists are never closed, and neither `tokens` nor
    `indent` is used after being updated. Restore it from the original
    source before relying on it.
    """
    # Array for token storage
    tokens = []

    # Parse a character stream into a token stream
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        # tk.cc = tk.bf.next_byte()
        indent = 0
        while True:
            t = tk.next_token()
            if t.type == EToken.EOF:
            # Closing delimiters decrease the nesting depth...
            if t.type in [
                    EToken.ARRAY_END, EToken.DICT_END, EToken.OBJECT_END
                indent -= 1
            # ...and opening delimiters increase it.
            if t.type in [
                    EToken.ARRAY_BEGIN, EToken.DICT_BEGIN, EToken.OBJECT_BEGIN
                indent += 1

def main():
    """Evaluate a built-in sample program (recursive Fibonacci, timed).

    A global environment is populated with every entry of `primitive` and
    the parsed sample is evaluated with an identity continuation.
    """
    # code = "sum = lambda(x, y) x + y; print(sum(2, 3));"
    # The literal is closed right after the program text; it previously
    # ran on and swallowed the commented-out alternatives below.
    code = """
    fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
    time( λ() println(fib(12)) );
    """
    # code = "print(1 + 2 * 3)"
    # code = """
    # fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
    # println(fib(8));
    # """
    parser = Parser(TokenStream(InputStream(code)))
    global_env = Environment()
    for name, func in primitive.items():
        global_env.define(name, func)
    evaluate(parser(), global_env, lambda result: result)
 def test_is_whitespace(self):
     # Iterates over every character listed in TokenStream.WHITESPACE.
     # NOTE(review): the loop body is missing, so this method does not parse
     # as written -- restore the assertion from the original source.
     for whitespace in TokenStream.WHITESPACE:
    def test05(self):
        """Test token seek and tell.

        Two positions are memorized (start of stream, and after four
        tokens); the stream is rewound to the first, then moved forward to
        the second, and the tokens read after each seek are checked.
        """
        filepath = os.path.join(TokenStreamTest.path, 'obj_stream3.dat')
        with open(filepath, 'rb') as f:
            tk = TokenStream(filepath, f)

            def expect_integers(*values):
                # Read one INTEGER token per expected value, in order.
                for value in values:
                    tok = tk.next_token()
                    self.assertEqual(EToken.INTEGER, tok.type)
                    self.assertEqual(value, tok.data)

            # Memorize position at the beginning, this bugs
            xpos = tk.tell()

            expect_integers(98, 73, 5, 19)

            pos2 = tk.tell()

            # Go back: the memorized positions were never used before, so
            # the assertions below could not observe any seek.
            tk.seek(xpos)
            expect_integers(98, 73)

            # Move forward
            tk.seek(pos2)
            expect_integers(18, 33, 45, 66, 13, 2)

            tok = tk.next_token()
            self.assertEqual(EToken.OBJ_REF, tok.type)
 def test_read_while(self):
     """Check that _read_while() collects characters while the predicate holds."""
     stream = TokenStream(InputStream('ab123='))
     # Reading must stop in front of '=', the first non-alphanumeric char.
     collected = stream._read_while(lambda ch: ch.isalnum())
     self.assertEqual(collected, 'ab123')
    def test02(self):
        """Test simple next_token() calls.

        Walks the sample token file and checks the type (and, for names,
        the data) of each token in sequence.
        """
        # Forward slashes are accepted on every platform; the previous
        # backslash path only resolved on Windows.
        filepath = 't/token.dat'
        with open(filepath, 'rb') as f:
            tk = TokenStream(filepath, f)

            def expect(token_type, data=None):
                # Read one token; check its type and, if given, its data.
                tok = tk.next_token()
                self.assertEqual(token_type, tok.type)
                if data is not None:
                    self.assertEqual(data, tok.data)

            # [[[
            for _ in range(3):
                expect(EToken.ARRAY_BEGIN)

            # <<>> >>
            expect(EToken.DICT_BEGIN)
            expect(EToken.DICT_END)
            expect(EToken.DICT_END)

            # ]
            expect(EToken.ARRAY_END)

            # /// : three names with empty data
            for _ in range(3):
                expect(EToken.NAME, b'')

            # Skip six tokens that are not checked here
            for _ in range(6):
                tk.next_token()

            # >>\r\n<<
            expect(EToken.DICT_END)
            expect(EToken.CRLF)
            expect(EToken.DICT_BEGIN)

            # /a /b /c /d
            for name in (b'a', b'b', b'c', b'd'):
                expect(EToken.NAME, name)

            expect(EToken.DICT_END)
 def __init__(self, filepath, f):
     """Wrap `f` in a TokenStream and read the first token ahead."""
     self.f = f
     token_stream = TokenStream(filepath, f)
     self.tk = token_stream
     # Keep one token of lookahead in self.tok.
     self.tok = token_stream.next_token()
class ObjectStream:

    # Initializer
    def __init__(self, filepath, f):
        """Wrap `f` in a TokenStream and read the first token ahead.

        filepath -- path handed to the TokenStream along with the file
        f        -- open file object the tokens are read from
        """
        self.tk = TokenStream(filepath, f)
        self.f = f
        # One token of lookahead: read, but not yet analyzed.
        self.tok = self.tk.next_token()

        # The xref table is a property of the object stream. It is filled
        # in by get_xref_section(); initialise it so that deref_object()
        # can test it before any cross-reference section has been parsed.
        self.xref_sec = None

    def seek(self, offset):
        """Move the underlying token stream to `offset` and refill self.tok.

        offset -- position understood by the token stream's own seek()
        """
        # Reposition first: `offset` was previously ignored, so the stream
        # simply kept reading from wherever it was.
        self.tk.seek(offset)
        # Normal init
        self.tok = self.tk.next_token()
    # get_indirect_obj_def
    def get_indirect_obj_def(self):
        """Found the opening OBJECT_BEGIN token, now get the entire object.

        On entry self.tok holds the OBJECT_BEGIN token. Returns the enclosed
        PdfObject once the matching OBJECT_END has been read, or a PdfObject
        of type EOF / ERROR if the definition is truncated or malformed.
        On a normal return self.tok holds the closing OBJECT_END token.
        """
        # Read the first token of the enclosed object. The checks must look
        # at this new token (they used to test the stale OBJECT_BEGIN token
        # and could never fire).
        self.tok = self.tk.next_token()
        if self.tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if self.tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)

        # Get the defined (internal) object
        obj = self.next_object()
        if obj.type in [EObject.ERROR, EObject.EOF]:
            return obj
        # self.tok holds the next token, read but not yet analyzed
        tok = self.tok

        # Ignore any end-of-line marker
        if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
            tok = self.tk.next_token()
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            elif tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)

        self.tok = tok
        if tok.type == EToken.OBJECT_END:
            return obj
        # Anything but OBJECT_END here is malformed input. Report it rather
        # than falling off the end and returning None, which the caller
        # would dereference.
        return PdfObject(EObject.ERROR)
    # get_array
    def get_array(self):
        """Found the opening ARRAY_BEGIN token, now get the entire array.

        On entry self.tok holds the ARRAY_BEGIN token. Returns a PdfObject
        of type ARRAY whose data is a Python list of PdfObjects, or a
        PdfObject of type EOF / ERROR. On a normal return self.tok holds the
        closing ARRAY_END token.
        """
        # Prepare an array object
        arr = []

        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.ARRAY_END:
                self.tok = tok
                # It's a python array, but the elements are PdfObjects
                return PdfObject(EObject.ARRAY, arr)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            # Ignore end-of-line markers, then re-examine the new token: it
            # may be the closing bracket.
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
                continue

            # next_object() analyzes the token stored in self.tok
            self.tok = tok
            obj = self.next_object()
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            # Keep the parsed element (it used to be dropped, so every
            # array came back empty).
            arr.append(obj)
            # self.tok holds the next token, read but not yet analyzed
            tok = self.tok

    # get_dictionary
    def get_dictionary(self):
        """Found the opening DICT_BEGIN token, now get the entire dictionary.

        On entry self.tok holds the DICT_BEGIN token. Returns a PdfObject of
        type DICTIONARY whose data is a Python dict mapping decoded key
        names to PdfObjects, or a PdfObject of type EOF / ERROR. On a normal
        return self.tok holds the closing DICT_END token.
        """
        # Prepare a dictionary object
        d = {}

        tok = self.tk.next_token()
        while True:
            if tok.type == EToken.DICT_END:
                self.tok = tok
                # It's a python dictionary, but the values are PdfObjects
                return PdfObject(EObject.DICTIONARY, d)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            # Ignore end-of-line markers
            if tok.type in [EToken.CR, EToken.LF, EToken.CRLF]:
                tok = self.tk.next_token()
            elif tok.type == EToken.NAME:
                # The key is a name; the value is whatever object follows.
                tok2 = self.tk.next_token()
                self.tok = tok2
                obj = self.next_object()
                # FIXME: can any bytes object be decoded like this ?
                # FIXME: I've lost the keys' original bytes object
                d[tok.data.decode('unicode_escape')] = obj

                # The next token is already stored in self.tok, but it hasn't
                # been analyzed yet.
                tok = self.tok
            else:
                # A key must be a name. This return used to sit inside the
                # NAME branch and aborted after the first key/value pair.
                return PdfObject(EObject.ERROR)
    # get_stream

    # FIXME define a proper stream class, with the dictionary in it
    def get_stream(self, length):
        """Read the body of a stream once STREAM_BEGIN has been seen.

        length -- value handed to the token stream's next_stream()

        Expects, in order: an end-of-line marker (LF or CRLF, not CR alone),
        the stream data, another end-of-line marker (CR, LF or CRLF) and
        the STREAM_END token. Returns a PdfObject of type STREAM carrying
        the data, or a PdfObject of type EOF / ERROR when the sequence is
        cut short or does not match.
        """
        # FIXME I need to stop testing EOF and ERROR after every single
        # next_XXX() function call, use exceptions instead.

        # End-of-line marker that must follow the 'stream' keyword
        # (PDF spec, page 19: CRLF or LF, never a lone CR).
        opener = self.tk.next_token()
        if opener.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if opener.type not in (EToken.LF, EToken.CRLF):
            return PdfObject(EObject.ERROR)

        # The stream data itself
        payload = self.tk.next_stream(length)
        if payload.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        raw = payload.data

        # End-of-line marker between the data and 'endstream'; it is not
        # counted in the stream length (PDF spec, page 19).
        separator = self.tk.next_token()
        if separator.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if separator.type not in (EToken.CR, EToken.LF, EToken.CRLF):
            return PdfObject(EObject.ERROR)

        # The closing STREAM_END
        closer = self.tk.next_token()
        if closer.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        if closer.type != EToken.STREAM_END:
            return PdfObject(EObject.ERROR)

        return PdfObject(EObject.STREAM, data=raw)
    # deflate_stream
    def deflate_stream(self, s, columns=None, predictor=None, W=None):
        """Decode stream s, encoded with flate, with predictor and W params.

        s         -- original compressed data stream (stripped), bytes
        columns   -- integer, currently unused
        predictor -- integer with values in { 1, 2, 10-15 }; only 12 is
                     un-predicted here
        W         -- sequence of three field widths in bytes; required
                     when predictor is 12

        Returns a pair (done, data). When no predictor is given, or the
        predictor is not supported, done is False and data is the raw
        decompressed bytes. Otherwise done is True and data is a list of
        (type, field1, field2) integer tuples, one per row.
        """
        # First, inflate the compressed data
        zd = zlib.decompress(s)
        if not predictor:
            # No DecodeParms, so we assume no predictor
            # False means we have not done the un-predicting, just return zd
            return False, zd

        if predictor != 12:
            print(f'Predictor value {predictor} not supported (currently only 12)')
            return False, zd

        # Each row is one predictor byte followed by sum(W) data bytes, e.g.
        # W == [1, 2, 1] gives rows of 5 bytes.
        width = sum(W) + 1

        # Is the uncompressed stream length a multiple of this width ?
        # (Exactly one of the two messages is printed.)
        if len(zd) % width == 0:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + f', {len(zd)}={len(zd)//(width)}*{width}')
        else:
            print(f'*** Uncompressed len(zd)={len(zd)}, width={width}'
                  + ', not a multiple')

        # Byte offsets of the three fields inside a row (after the
        # predictor byte at index 0).
        first_end = 1 + W[0]
        second_end = first_end + W[1]
        third_end = second_end + W[2]

        prev = [0] * width
        arr = []
        for r in range(len(zd) // width):
            rowdata = list(zd[r * width:(r + 1) * width])
            # Undo the prediction: every data byte is stored as a delta
            # from the byte above it, modulo 256.
            for i in range(1, width):
                rowdata[i] = (rowdata[i] + prev[i]) % 256
            prev = rowdata  # Update prev for next pass

            # Split the row according to W; fields are big-endian integers.
            if W[0]:
                kind = int.from_bytes(bytes(rowdata[1:first_end]), 'big')
            else:
                # PDF cross-reference streams: an absent type field
                # defaults to 1.
                kind = 1
            fld1 = int.from_bytes(bytes(rowdata[first_end:second_end]), 'big')
            fld2 = int.from_bytes(bytes(rowdata[second_end:third_end]), 'big')
            arr.append((kind, fld1, fld2))

        # True means we have done the un-predicting, so what we return is an
        # array of 3-tuples
        return True, arr

    # get_xref_section

    def get_xref_section(self):
        """Parse a cross reference section into an object.

        On entry self.tok holds the XREF_SECTION token. A fresh XrefSection
        is stored in self.xref_sec and, once a line can no longer be parsed
        as a sub-section header, returned wrapped in a PdfObject of type
        XREF_SECTION. A PdfObject of type EOF / ERROR is returned when the
        token stream reports one.
        """
        # self.tok has a EToken.XREF_SECTION, parse the following tokens.

        # "Each cross-reference section shall begin with a line containing the
        # keyword xref": this implies an end-of-line marker after 'xref'
        tok = self.tk.next_token()
        if tok.type not in [EToken.CR, EToken.LF, EToken.CRLF]:
            self.tok = tok  # FIXME this way, self.tok will be analyzed again
            return PdfObject(EObject.ERROR)

        # Loop over cross-reference subsections
        self.xref_sec = XrefSection()
        while True:
            # Get a special token representing the sub-section header
            tok = self.tk.get_subsection_header()
            if tok.type == EToken.EOF:
                return PdfObject(EObject.EOF)
            if tok.type == EToken.ERROR:
                return PdfObject(EObject.ERROR)
            if tok.type == EToken.UNEXPECTED:
                # Couldn't parse the line as a sub-section header, this means
                # that the sub-section is over.  The xref is stored as a
                # property of this ObjectStream, and it is also returned.
                # State has been rolled back, so prepare to continue
                self.tok = self.tk.next_token()
                return PdfObject(EObject.XREF_SECTION, self.xref_sec)

            # Sub-section header was successfully parsed
            first_objn, entry_cnt = tok.data

            # I'm assuming entry_cnt is not 0.
            subs = XrefSubSection(first_objn, entry_cnt)
            for i in range(entry_cnt):
                # Get a special token representing a sub-section entry
                tok = self.tk.get_subsection_entry()
                if tok.type == EToken.EOF:
                    return PdfObject(EObject.EOF)
                if tok.type == EToken.ERROR:
                    return PdfObject(EObject.ERROR)
                # NOTE(review): the entry token is read but never stored in
                # `subs` -- a line appears to be missing here.

            # Finish off this sub-section
            # NOTE(review): `subs` is never added to self.xref_sec, so the
            # returned section stays as constructed -- a line appears to
            # be missing here as well.
    # get_cross_reference

    def get_cross_reference(self):
        """Parse the cross reference found at the current token.

        The current token must introduce either a traditional table
        (XREF_SECTION) or, for PDF 1.5 and later, the indirect object
        definition of a cross-reference stream (INTEGER). Returns the parsed
        object, a PdfObject of type EOF at end of input, or a PdfObject of
        type ERROR in every other case.
        """
        kind = self.tok.type

        if kind == EToken.EOF:
            return PdfObject(EObject.EOF)

        # Traditional cross-reference table
        if kind == EToken.XREF_SECTION:
            return self.get_xref_section()

        # Cross-reference stream: only an indirect object definition is
        # acceptable after the leading integer.
        if kind == EToken.INTEGER:
            candidate = self.next_object()
            if candidate.type == EObject.IND_OBJ_DEF:
                return candidate

        # Neither form of cross reference was found
        return PdfObject(EObject.ERROR)

    # next_object
    def next_object(self):
        """Get the next object as a PdfObject.

        Invariant: the token to analyze has already been read from the
        stream and is stored in self.tok. On a normal return the token that
        follows the parsed object has been read (but not analyzed) and
        stored in self.tok. Returns a PdfObject of type EOF / ERROR when the
        input ends or cannot be parsed.
        """
        eol = [EToken.CR, EToken.LF, EToken.CRLF]
        tok = self.tok

        # Ignore CRLF (why do I parse the tokens then ?)
        while tok.type in eol:
            tok = self.tok = self.tk.next_token()

        # Have we reached EOF ?
        if tok.type == EToken.EOF:
            return PdfObject(EObject.EOF)
        elif tok.type == EToken.ERROR:
            return PdfObject(EObject.ERROR)
        elif tok.type == EToken.VERSION_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.VERSION_MARKER, data=tok.data)

        # Now analyze tok: is it a boolean ?
        elif tok.type == EToken.TRUE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, True)
        elif tok.type == EToken.FALSE:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.BOOLEAN, False)

        # Is it an integer number ?
        elif tok.type == EToken.INTEGER:
            # Attempt to find the longest match first. Object definitions and
            # references are two integers plus another token, they must be
            # parsed first, and if not found, then we'll settle for the simple
            # integer.
            # Lookahead 1 token. If we find another integer, keep looking.
            # If we find an OBJECT_BEGIN, then we have an indirect object
            # definition.
            # If we find an OBJ_REF, then we have an indirect reference.
            pos = self.tk.tell()
            tok2 = self.tk.next_token()
            if tok2.type == EToken.INTEGER:
                # Keep looking
                tok3 = self.tk.next_token()
                if tok3.type == EToken.OBJECT_BEGIN:
                    # Start creating the object with the object number (from
                    # tok) and generation number (from tok2)
                    # Get the defined (internal) object
                    self.tok = tok3
                    obj = self.get_indirect_obj_def()
                    if obj.type in [EObject.ERROR, EObject.EOF]:
                        return obj
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_DEF,
                                     data=dict(obj=obj, objn=tok.data, gen=tok2.data))
                elif tok3.type == EToken.OBJ_REF:
                    self.tok = self.tk.next_token()
                    return PdfObject(EObject.IND_OBJ_REF,
                                     data=dict(objn=tok.data, gen=tok2.data))
            # Plain integer: rewind to just after it so the tokens consumed
            # by the lookahead are read again (they used to be skipped).
            self.tk.seek(pos)
            self.tok = self.tk.next_token()
            return PdfObject(EObject.INTEGER, tok.data)

        # Is it a real number ?
        elif tok.type == EToken.REAL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.REAL, tok.data)

        # Is it a string ?
        elif tok.type in [EToken.LITERAL_STRING, EToken.HEX_STRING]:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STRING, tok.data)  # bytearray

        # Is it a name ?
        elif tok.type == EToken.NAME:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NAME, tok.data)  # bytearray

        # Is it an array ?
        elif tok.type == EToken.ARRAY_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_array()
            # self.tok == ARRAY_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            self.tok = self.tk.next_token()
            return obj

        # Is it a dictionary ? or a (dictionary, stream) couple ?
        elif tok.type == EToken.DICT_BEGIN:
            # self.tok already has the right value, tok was taken from there
            obj = self.get_dictionary()
            # self.tok == DICT_END
            if obj.type in [EObject.ERROR, EObject.EOF]:
                return obj
            # Skip end-of-line markers after the dictionary; stop at the
            # first other token.
            while True:
                self.tok = self.tk.next_token()
                if self.tok.type not in eol:
                    break
            if self.tok.type != EToken.STREAM_BEGIN:
                return obj  # return the dict

            # We have found a STREAM_BEGIN token, so 'obj' is the stream
            # dictionary
            # FIXME this may not be right. Length is given as an indirect
            # object ref, we must have parsed all the xref tables at this point
            # if we want to parse this stream.
            o = obj.data.get('Length')
            if o is None:
                # A stream cannot be read without its length
                return PdfObject(EObject.ERROR)
            if o.type == EObject.INTEGER:
                ln = o.data
            elif o.type == EObject.IND_OBJ_REF:
                # Resolve the reference, then come back to just after the
                # 'stream' keyword so the data can be read.
                pos = self.tk.tell()
                target = self.deref_object(o)
                self.tk.seek(pos)
                if target is None or target.type != EObject.INTEGER:
                    return PdfObject(EObject.ERROR)
                ln = target.data
            else:
                return PdfObject(EObject.ERROR)
            obj2 = self.get_stream(ln)
            # FIXME use exceptions instead
            if obj2.type in [EObject.ERROR, EObject.EOF]:
                return obj2
            self.tok = self.tk.next_token()
            return PdfObject(EObject.COUPLE, data=(obj, obj2))

        # Is it a xref section ?
        elif tok.type == EToken.XREF_SECTION:
            obj = self.get_xref_section()
            # self.tok already holds the next token
            return obj

        # Is it a trailer ?
        elif tok.type == EToken.TRAILER:
            tok = self.tk.next_token()
            # Ignore CRLF (why do I parse the tokens then ?)
            while tok.type in eol:
                tok = self.tk.next_token()
            if tok.type != EToken.DICT_BEGIN:
                # FIXME specify once and for all which token I want to see when
                # an error has been detected. The question is "how do I recover
                # from this error ?"
                self.tok = self.tk.next_token()
                return PdfObject(EObject.ERROR)
            obj = self.get_dictionary()
            self.tok = self.tk.next_token()
            return PdfObject(EObject.TRAILER, data=obj)

        elif tok.type == EToken.STARTXREF:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.STARTXREF)

        elif tok.type == EToken.EOF_MARKER:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.EOF_MARKER)

        # Is it a stream ? Wrong. Streams are preceded by a dictionary.
        elif tok.type == EToken.STREAM_BEGIN:
            return PdfObject(EObject.ERROR)

        # Is it null ?
        elif tok.type == EToken.NULL:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.NULL)

        # Nothing that was expected here: skip the token and report it
        else:
            self.tok = self.tk.next_token()
            return PdfObject(EObject.ERROR)

    # deref_object - read an indirect object from the file

    def deref_object(self, o):
        """Resolve an indirect object reference to the object it designates.

        Args:
            o: the object to resolve; its type must be EObject.IND_OBJ_REF
                and its data must hold the 'objn' and 'gen' keys.

        Returns:
            The 'obj' entry of the indirect object definition read by
            next_object(), or None when 'o' is not a reference, when no
            xref section is available, when the xref lookup yields nothing,
            or when the object read is not an indirect object definition.
        """
        if o.type != EObject.IND_OBJ_REF:
            print(f'Expecting an indirect object reference, got "{o.type}"'
                  ' instead')
            return None

        xref = self.xref_sec
        if not xref:
            return None

        # Look the (object number, generation) pair up in the xref section
        ref = o.data
        entry = xref.get_object(ref['objn'], ref['gen'])
        if not entry:
            return None

        # The entry is a 3-item sequence whose first item is the file offset.
        # NOTE(review): 'offset' is never used afterwards and nothing here
        # moves the token stream to it, so next_object() reads from wherever
        # the stream currently is. A seek looks to be missing - confirm.
        offset, _, _ = entry

        # Expected to be the "<objn> <gen> obj ..." definition
        definition = self.next_object()
        if definition.type != EObject.IND_OBJ_DEF:
            print('Expecting an indirect object definition, got '
                  f'"{definition.type}" instead')
            return None

        # The indirect object definition wraps the object we want
        return definition.data['obj']
 def test_skip_comment(self):
     """Check the next input character of a stream built from a comment line.

     The source text is a '#' comment line followed by a line holding 'def';
     the character peeked from the underlying input must be 'd'.
     """
     source = InputStream('# abc\ndef')
     stream = TokenStream(source)
     # NOTE(review): nothing between construction and the assertion asks the
     # stream to skip the comment; a call may be missing here - confirm.
     upcoming = stream._input_stream.peek()
     self.assertEqual(upcoming, 'd')
 def test_is_op_char(self):
     # Walks every entry of TokenStream.OPERATOR.
     # NOTE(review): the loop has no body in the file as it stands, so this
     # method does not parse. The per-operator assertion must be restored;
     # the predicate it should call is not visible from here.
     for operator in TokenStream.OPERATOR:
    def test_read_next(self):
        """Check the token sequence produced by successive _read_next() calls.

        Three inputs are used:
        - a line starting with a comment followed by a number, identifiers,
          a string, keywords, an operator and punctuation; once it is
          exhausted, _read_next() must return Token('null', 'null');
        - a lone backspace character, for which _read_next() must raise;
        - a lambda expression, whose first token is the 'λ' keyword.
        """
        token_stream = TokenStream(
            InputStream(' # comment\n123 abc "nba" let a=2  >= js;'))
        self.assertEqual(token_stream._read_next(), Token('num', 123.0))
        self.assertEqual(token_stream._read_next(), Token('var', 'abc'))
        self.assertEqual(token_stream._read_next(), Token('str', 'nba'))
        self.assertEqual(token_stream._read_next(), Token('kw', 'let'))
        # No whitespace around '=', so 'a=2' is read as one identifier
        self.assertEqual(token_stream._read_next(), Token('var', 'a=2'))
        self.assertEqual(token_stream._read_next(), Token('op', '>='))
        self.assertEqual(token_stream._read_next(), Token('kw', 'js'))
        self.assertEqual(token_stream._read_next(), Token('punc', ';'))
        # End of input
        self.assertEqual(token_stream._read_next(), Token('null', 'null'))

        # A character that starts no token must be rejected.
        # The 'with' block had no body (a syntax error); reading the next
        # token is the operation this test exercises everywhere else.
        token_stream = TokenStream(InputStream('\x08'))
        with self.assertRaises(Exception):
            token_stream._read_next()

        token_stream = TokenStream(InputStream('λ (n) 1'))
        self.assertEqual(token_stream._read_next(), Token("kw", 'λ'))
    def test_literal01(self):
        """Test the set of example strings from the spec.

        Reads 't/literal01.dat' and checks six literal string tokens in
        order, skipping the end-of-line tokens found between them.
        """
        eol_types = [EToken.CR, EToken.LF, EToken.CRLF]

        def next_significant_token(stream):
            """Return the next token of 'stream' that is not an end of line."""
            # The original inline loops had lost their 'break', which left
            # them syntactically invalid (and endless once patched naively).
            while True:
                candidate = stream.next_token()
                if candidate.type not in eol_types:
                    return candidate

        filepath = 't/literal01.dat'
        with open(filepath, 'rb') as f:
            tk = TokenStream(filepath, f)

            # This is a string
            tok = tk.next_token()
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertEqual(16, len(b))
            self.assertEqual(b'This', b[0:4])

            # Strings may contain newlines\n and such
            tok = next_significant_token(tk)
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertTrue(b.startswith(b'Strings may'))

            # Strings may contain balanced parentheses...
            tok = next_significant_token(tk)
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertEqual(b'(x)', b[41:44])
            self.assertTrue(b.endswith(b'% and so on).'))

            # The following is an empty string.
            tok = next_significant_token(tk)
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertEqual(b'The following is an empty string.', b)

            # Empty string
            tok = next_significant_token(tk)
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertEqual(0, len(b))
            self.assertEqual(b'', b)

            # It has zero (0) length.
            tok = next_significant_token(tk)
            self.assertEqual(EToken.LITERAL_STRING, tok.type)
            b = tok.data
            self.assertEqual(23, len(b))
            self.assertEqual(b'It has zero (0) length.', b)
 def test_is_punc(self):
     # Walks every entry of TokenStream.PUNCTUATION.
     # NOTE(review): the loop has no body in the file as it stands, so this
     # method does not parse. The per-character assertion must be restored;
     # the predicate it should call is not visible from here.
     for punc in TokenStream.PUNCTUATION:
    def test01(self):
        """Interleave next_token() and peek_token() calls and check each token.

        The tokens come from 'token_stream.dat'. Peeked tokens are expected
        to be handed out again, in the same order, by later next_token()
        calls.
        """
        unset = object()

        def expect(token, kind, payload=unset):
            """Assert the type of 'token' and, when given, its data."""
            self.assertEqual(kind, token.type)
            if payload is not unset:
                self.assertEqual(payload, token.data)

        # Type and data of the four numbers inside the array
        box_numbers = [
            (EToken.INTEGER, 0),
            (EToken.INTEGER, 0),
            (EToken.REAL, 595.276),
            (EToken.REAL, 841.89),
        ]

        filepath = os.path.join(TokenStreamTest.path, 'token_stream.dat')
        with open(filepath, 'rb') as handle:
            tk = TokenStream(filepath, handle)

            # Consume the first three tokens
            expect(tk.next_token(), EToken.DICT_BEGIN)
            expect(tk.next_token(), EToken.NAME, b'Contents')
            expect(tk.next_token(), EToken.INTEGER, 6624)

            # A single peek...
            expect(tk.peek_token(), EToken.INTEGER, 0)

            # ...and the peeked token is what next_token() hands out
            expect(tk.next_token(), EToken.INTEGER, 0)

            # Look three tokens ahead
            expect(tk.peek_token(), EToken.OBJ_REF)
            expect(tk.peek_token(), EToken.NAME, b'CropBox')
            expect(tk.peek_token(), EToken.ARRAY_BEGIN)

            # Consume two of them
            expect(tk.next_token(), EToken.OBJ_REF)
            expect(tk.next_token(), EToken.NAME, b'CropBox')

            # ARRAY_BEGIN is still waiting among the peeked tokens.
            # (The original author was unsure this is the right spec.)

            # Look five tokens further ahead
            for kind, payload in box_numbers:
                expect(tk.peek_token(), kind, payload)
            expect(tk.peek_token(), EToken.ARRAY_END)

            # Consume the pending ARRAY_BEGIN, the five peeked tokens, and
            # one more
            expect(tk.next_token(), EToken.ARRAY_BEGIN)

            for kind, payload in box_numbers:
                expect(tk.next_token(), kind, payload)
            expect(tk.next_token(), EToken.ARRAY_END)

            expect(tk.next_token(), EToken.NAME, b'MediaBox')
 def test_evaluate(self):
     """Run evaluate() on hand-built ASTs and on one parsed program.

     Each case builds an AST, evaluates it in a fresh environment and checks
     the value passed to the continuation, or that evaluation raises.

     NOTE(review): several expressions in this method had lost lines and no
     longer parsed. The pieces marked "reconstructed" below were rebuilt
     from the comment describing the program, the expected value, and the
     constructor shapes used elsewhere in this method - confirm against the
     original history.
     """
     ast = LiteralAst(1.0)
     environment = Environment()
     evaluate(ast, environment, lambda value: self.assertEqual(value, 1.0))
     ast = LiteralAst(True)
     environment = Environment()
     evaluate(ast, environment, self.assertTrue)
     ast = LiteralAst(False)
     environment = Environment()
     evaluate(ast, environment, self.assertFalse)
     ast = LiteralAst("aaa")
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, "aaa"))
     ast = BinaryAst('+', LiteralAst(1), LiteralAst(2))
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 3.0))
     ast = ProgAst([])
     evaluate(ast, Environment(), self.assertFalse)
     ast = ProgAst([LiteralAst(1)])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 1.0))
     ast = ProgAst([LiteralAst(1), LiteralAst(2)])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 2.0))
     # Assigning to something that is not a variable must fail
     ast = AssignAst(LiteralAst(1), LiteralAst("a"))
     with self.assertRaises(Exception):
         evaluate(ast, Environment(), lambda value: value)
     ast = ProgAst([AssignAst(VarAst('a'), LiteralAst("foo")), VarAst('a')])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, "foo"))
     # Same assignment, but evaluated in a nested environment: must fail
     ast = AssignAst(VarAst("a"), LiteralAst("foo"))
     with self.assertRaises(Exception):
         evaluate(ast, Environment(Environment()), lambda value: value)
     # Reconstructed: the argument list was missing; the expected value 1.0
     # implies a single literal 1.
     ast = CallAst(
         LambdaAst("", ["a"], VarAst("a")),
         [LiteralAst(1)])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 1.0))
     ast = CallAst(LambdaAst("", ["a"], VarAst("a")), [LiteralAst("abc")])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, "abc"))
     # # (λ loop (n) if n > 0 then n + loop(n - 1) else 0) (10)
     # Reconstructed: the LambdaAst, IfAst, BinaryAst and CallAst openers.
     ast = CallAst(
         LambdaAst(
             "loop", ["n"],
             IfAst(
                 BinaryAst(">", VarAst("n"), LiteralAst(0)),
                 BinaryAst(
                     "+", VarAst("n"),
                     CallAst(VarAst("loop"),
                             [BinaryAst('-', VarAst('n'), LiteralAst(1))])),
                 LiteralAst(0))), [LiteralAst(10)])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 55.0))
     # # let (x) x;
     ast = LetAst([VarDefAst("x", None)], VarAst("x"))
     evaluate(ast, Environment(), self.assertFalse)
     # # let (x = 2, y = x + 1, z = x + y) x + y + z
     # Reconstructed: the trailing VarAst("z") operand and closers.
     ast = LetAst([
         VarDefAst("x", LiteralAst(2)),
         VarDefAst("y", BinaryAst("+", VarAst("x"), LiteralAst(1))),
         VarDefAst("z", BinaryAst("+", VarAst("x"), VarAst("y")))
     ], BinaryAst("+", BinaryAst("+", VarAst("x"), VarAst("y")),
                  VarAst("z")))
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 10.0))
     # # the second expression will result an errors,
     # since x, y, z are bound to the let body
     # # let (x = 2, y = x + 1, z = x + y) x + y + z; x + y + z
     # Reconstructed: the LetAst opener, both VarAst('z') operands and the
     # closers.
     ast = ProgAst([
         LetAst([
             VarDefAst('x', LiteralAst(2)),
             VarDefAst('y', BinaryAst('+', VarAst('x'), LiteralAst(1))),
             VarDefAst('z', BinaryAst('+', VarAst('x'), VarAst('y')))
         ], BinaryAst('+', BinaryAst('+', VarAst('x'), VarAst('y')),
                      VarAst('z'))),
         BinaryAst('+', BinaryAst('+', VarAst('x'), VarAst('y')),
                   VarAst('z'))
     ])
     with self.assertRaises(Exception):
         evaluate(ast, Environment(), lambda value: value)
     ast = IfAst(LiteralAst(""), LiteralAst(1), None)
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 1.0))
     ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(2))
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 2.0))
     ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(False))
     evaluate(ast, Environment(), self.assertFalse)
     # Something that is not an AST node must be rejected
     ast = {"type": "foo", "value": 'foo'}
     with self.assertRaises(Exception):
         evaluate(ast, Environment(), lambda value: value)
     # fib = λ(n) if n < 2 then n else fib(n - 1) + fib(n - 2);
     # fib(6);
     # Reconstructed: the AssignAst, LambdaAst, IfAst, BinaryAst and CallAst
     # openers, the 'then' branch and the closers.
     ast = ProgAst([
         AssignAst(
             VarAst('fib'),
             LambdaAst(
                 'n', ['n'],
                 IfAst(
                     BinaryAst('<', VarAst('n'), LiteralAst(2)),
                     VarAst('n'),
                     BinaryAst(
                         '+',
                         CallAst(
                             VarAst('fib'),
                             [BinaryAst('-', VarAst('n'), LiteralAst(1))]),
                         CallAst(
                             VarAst('fib'),
                             [BinaryAst('-', VarAst('n'), LiteralAst(2))]
                         ))))),
         CallAst(VarAst('fib'), [LiteralAst(6)])
     ])
     evaluate(ast, Environment(),
              lambda value: self.assertEqual(value, 8.0))
     ast = IfAst(LiteralAst(False), LiteralAst(1), LiteralAst(False))
     evaluate(ast, Environment(), self.assertFalse)
     # Calling something that is not a function must fail
     ast = CallAst(LiteralAst(1), [])
     with self.assertRaises(Exception):
         evaluate(ast, Environment(), self.assertFalse)
     # NOTE(review): the string literal was unterminated. It is closed right
     # after the only surviving line; any further program text (such as a
     # definition of 'twice') is lost, so 'twice' is presumably supplied by
     # 'primitive' - confirm.
     code = """
     2 + twice(3, 4)
     """
     global_env = Environment()
     for name, func in primitive.items():
         global_env.define(name, func)
     parser = Parser(TokenStream(InputStream(code)))
     evaluate(parser(), global_env, lambda result: result)
        if token.type != 'op':
            return left
        his_prec = self.PRECEDENCE[token.value]
        if his_prec > my_prec:
            right = self._maybe_binary(self._parse_atom(), his_prec)
            if token.value == '=':
                binary = AssignAst(left, right)
                binary = BinaryAst(token.value, left, right)
            return self._maybe_binary(binary, my_prec)
        return left

    def __call__(self) -> ProgAst:
        """Parse the input and return what _parse_toplevel() produces."""
        program = self._parse_toplevel()
        return program

    def unexpected(self):
        """Raise an exception naming the token at the head of the stream.

        Called whenever the parser encounters an error.

        Raises:
            Exception: always, with the message 'Unexpected token: <token>'
                where <token> is what the token stream's peek() returns.
        """
        # NOTE(review): the docstring delimiters and the statement that
        # consumed this message were missing, so the method did not parse.
        # The original text promised an error location as well; whatever
        # call added it is not recoverable from here, so a plain exception
        # carrying the message is raised - confirm against history.
        raise Exception(
            f'Unexpected token: {self._token_stream.peek()}')

if __name__ == '__main__':
    # Script entry point: parse the program whose path is the first
    # command-line argument.
    # Programs may contain non-ASCII characters (the 'λ' keyword), so the
    # file is decoded as UTF-8 explicitly instead of with the platform's
    # default encoding.
    with open(sys.argv[1], encoding='utf-8') as f:
        code = f.read()
    ast = Parser(TokenStream(InputStream(code)))()
    def test_read_ident(self):
        """Check what _read_identifier() returns at the start of four inputs.

        Each case gives the source text and the kind and value of the token
        expected from a fresh stream over that text.
        """
        cases = (
            # '=' glued to the name is read as part of the identifier
            ('a=1', 'var', 'a=1'),
            # whitespace ends the identifier
            ('a = 1', 'var', 'a'),
            # '(' ends the identifier; 'let' is a keyword
            ('let(a = 1', 'kw', 'let'),
            ('js "aaa"', 'kw', 'js'),
        )
        for text, kind, value in cases:
            token_stream = TokenStream(InputStream(text))
            result = token_stream._read_identifier()
            self.assertEqual(Token(kind, value), result)