Example #1
def get_name(kind):
    """ used for reporting errors from match() at the parsing stage """
    if 0 <= kind < len(Token.token_types):
        return Token.token_types[kind]
    raise ScannerException("Index out of bounds. Unknown token " + str(kind))
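
A minimal sketch of how get_name() gets exercised, assuming only that Token exposes an index-aligned token_types list (the names below are illustrative, not the real Ezhil token table):

class ScannerException(Exception):
    pass

class Token:
    token_types = ["EOF", "ID", "NUMBER", "STRING", "PLUS", "MINUS"]

    @staticmethod
    def get_name(kind):
        if 0 <= kind < len(Token.token_types):
            return Token.token_types[kind]
        raise ScannerException("Index out of bounds. Unknown token " + str(kind))

print(Token.get_name(2))  # -> NUMBER
# Token.get_name(99) would raise ScannerException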
Example #2
def match(self, tokval):
    """ if the token kind matches, consume and return the token """
    if self.peek().kind != tokval:
        raise ScannerException("cannot find token " + Token.get_name(tokval) +
                               " got " + str(self.peek()) + " instead!")
    return self.dequeue()
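
A hedged sketch of the match()/peek()/dequeue() contract this relies on, using plain (value, kind) tuples in place of real Lexeme objects:

class ScannerException(Exception):
    pass

class MiniLexer:
    def __init__(self, toks):
        # tokens are stored reversed, so the head of the Q is tokens[-1]
        self.tokens = list(reversed(toks))

    def peek(self):
        if len(self.tokens) == 0:
            raise ScannerException("tokens[] queue is empty ")
        return self.tokens[-1]

    def dequeue(self):
        return self.tokens.pop()

    def match(self, kind):
        if self.peek()[1] != kind:
            raise ScannerException("cannot find token " + kind +
                                   " got " + str(self.peek()) + " instead!")
        return self.dequeue()

lx = MiniLexer([("x", "ID"), ("=", "EQUALS"), ("1", "NUMBER")])
print(lx.match("ID"))  # -> ('x', 'ID'), consumed
print(lx.peek())       # -> ('=', 'EQUALS'), still queued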
Example #3
    def get_lexeme(self, chunks, pos):
        if chunks is None:
            return None
        if (self.debug): print("chunks", chunks)
        if chunks == "பதிப்பி":
            tval = EzhilLexeme(chunks, EzhilToken.PRINT)
        elif chunks == "தேர்ந்தெடு":
            tval = EzhilLexeme(chunks, EzhilToken.SWITCH)
        elif chunks == "தேர்வு":
            tval = EzhilLexeme(chunks, EzhilToken.CASE)
        elif chunks == "ஏதேனில்":
            tval = EzhilLexeme(chunks, EzhilToken.OTHERWISE)
        elif chunks == "ஆனால்":
            tval = EzhilLexeme(chunks, EzhilToken.IF)
        elif chunks == "இல்லைஆனால்":
            tval = EzhilLexeme(chunks, EzhilToken.ELSEIF)
        elif chunks == "இல்லை":
            tval = EzhilLexeme(chunks, EzhilToken.ELSE)
        elif chunks == "ஆக":
            tval = EzhilLexeme(chunks, EzhilToken.FOR)
        elif chunks == "ஒவ்வொன்றாக":
            tval = EzhilLexeme(chunks, EzhilToken.FOREACH)
        elif chunks == "இல்":
            tval = EzhilLexeme(chunks, EzhilToken.COMMA)
        elif chunks == "வரை":
            tval = EzhilLexeme(chunks, EzhilToken.WHILE)
        elif chunks == "செய்":
            tval = EzhilLexeme(chunks, EzhilToken.DO)
        elif chunks == "முடியேனில்":
            tval = EzhilLexeme(chunks, EzhilToken.DOWHILE)
        elif chunks == "பின்கொடு":
            tval = EzhilLexeme(chunks, EzhilToken.RETURN)
        elif chunks == "முடி":
            tval = EzhilLexeme(chunks, EzhilToken.END)
        elif chunks == "நிரல்பாகம்":
            tval = EzhilLexeme(chunks, EzhilToken.DEF)
        elif chunks == "தொடர்":
            tval = EzhilLexeme(chunks, EzhilToken.CONTINUE)
        elif chunks == "நிறுத்து":
            tval = EzhilLexeme(chunks, EzhilToken.BREAK)
        elif chunks == "@":
            tval = EzhilLexeme(chunks, EzhilToken.ATRATEOF)
        elif chunks == "=":
            tval = EzhilLexeme(chunks, EzhilToken.EQUALS)
        elif chunks == "-":
            tval = EzhilLexeme(chunks, EzhilToken.MINUS)
        elif chunks == "+":
            tval = EzhilLexeme(chunks, EzhilToken.PLUS)
        elif chunks == ">":
            tval = EzhilLexeme(chunks, EzhilToken.GT)
        elif chunks == "<":
            tval = EzhilLexeme(chunks, EzhilToken.LT)
        elif chunks == ">=":
            tval = EzhilLexeme(chunks, EzhilToken.GTEQ)
        elif chunks == "<=":
            tval = EzhilLexeme(chunks, EzhilToken.LTEQ)
        elif chunks == "==":
            tval = EzhilLexeme(chunks, EzhilToken.EQUALITY)
        elif chunks == "!=":
            tval = EzhilLexeme(chunks, EzhilToken.NEQ)
        elif chunks == "*":
            tval = EzhilLexeme(chunks, EzhilToken.PROD)
        elif chunks == "/":
            tval = EzhilLexeme(chunks, EzhilToken.DIV)
        elif chunks == ",":
            tval = EzhilLexeme(chunks, EzhilToken.COMMA)
        elif chunks == "(":
            tval = EzhilLexeme(chunks, EzhilToken.LPAREN)
        elif chunks == ")":
            tval = EzhilLexeme(chunks, EzhilToken.RPAREN)
        elif chunks == "[":
            tval = EzhilLexeme(chunks, EzhilToken.LSQRBRACE)
        elif chunks == "]":
            tval = EzhilLexeme(chunks, EzhilToken.RSQRBRACE)
        elif chunks == "{":
            tval = Lexeme(chunks, Token.LCURLBRACE)
        elif chunks == "}":
            tval = Lexeme(chunks, Token.RCURLBRACE)
        elif chunks == ":":
            tval = Lexeme(chunks, Token.COLON)
        elif chunks == "%":
            tval = EzhilLexeme(chunks, EzhilToken.MOD)
        elif chunks == "^":
            tval = EzhilLexeme(chunks, EzhilToken.EXP)
        elif chunks == "&&":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_AND)
        elif chunks == "&":
            tval = Lexeme(chunks, EzhilToken.BITWISE_AND)
        elif chunks == "||":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_OR)
        elif chunks == "|":
            tval = Lexeme(chunks, EzhilToken.BITWISE_OR)
        elif chunks == "!":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_NOT)
        elif (chunks[0] == "\"" and chunks[-1] == "\""):
            tval = EzhilLexeme(chunks[1:-1], EzhilToken.STRING)
        elif isdigit(chunks[0]) or chunks[0] == '+' or chunks[0] == '-':
            # deduce a float or an integer
            if (chunks.find('.') >= 0 or chunks.find('e') >= 0
                    or chunks.find('E') >= 0):
                tval = EzhilLexeme(float(chunks), EzhilToken.NUMBER)
            else:
                tval = EzhilLexeme(int(chunks), EzhilToken.NUMBER)
        elif isalpha(chunks[0]) or has_tamil(chunks) or chunks[0] == '_':
            ## check for tamil/english/mixed identifiers, even those starting with a lead '_'
            tval = EzhilLexeme(chunks, EzhilToken.ID)
        else:
            raise ScannerException("Lexical error: " + str(chunks) +
                                   " at Line , Col " +
                                   str(self.get_line_col(pos)) + " in file " +
                                   self.fname)

        [l, c] = self.get_line_col(pos)
        tval.set_line_col([l, c])
        tval.set_file_name(self.fname)
        self.tokens.append(tval)

        if (self.debug): print("Lexer token = ", str(tval))

        return l
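
The keyword portion of this elif-chain is a fixed string-to-token mapping, so a dict lookup expresses it more compactly. A sketch (token names mirror EzhilToken, but this table is illustrative, not part of Ezhil itself):

KEYWORDS = {
    "பதிப்பி": "PRINT",
    "ஆனால்": "IF",
    "இல்லைஆனால்": "ELSEIF",
    "இல்லை": "ELSE",
    "வரை": "WHILE",
    "முடி": "END",
}

def classify(chunk):
    # one O(1) lookup instead of a long chain of == comparisons
    return KEYWORDS.get(chunk, "ID")

print(classify("ஆனால்"))  # -> IF
print(classify("foo"))     # -> ID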
Example #4
    def tokenize(self, data=None):
        """ do the hard work of tokenizing and
        put EzhilLexemes into the tokens[] Q """
        if (self.stdin_mode):
            if (self.debug): print(self.tokens)
            ## in stdin_mode, clean any EOF left over from a previous session out of the Q
            if (len(self.tokens) != 0):
                self.match(EzhilToken.EOF)
            if (len(self.tokens) != 0):
                raise ScannerException(
                    "Lexer: token Q has tokens from a previous session")
            self.tokens = list()
        else:
            data = "".join(self.File.readlines())
        if (self.debug): print(data)
        idx = 0
        tok_start_idx = 0

        while (idx < len(data)):
            c = data[idx]
            if (c == ' ' or c == '\t' or c == '\n'):
                if (c == '\n'):
                    ##actual col = idx - col_idx
                    self.update_line_col(idx)
                idx = idx + 1
            elif (c == '\r'):
                idx = idx + 1
                continue
            elif (c == '#'):
                ## skip single-line comments, like Python/Octave
                start = idx
                while (idx < len(data) and not (data[idx] in ['\r', '\n'])):
                    idx = idx + 1
                if (idx < len(data) and data[idx] == '\r'):
                    idx = idx + 1
                end = idx
                self.comments[self.line] = data[start:end]
            elif (isdigit(c)):
                num = c
                tok_start_idx = idx
                idx = idx + 1
                ## note: a leading '+' or '-' is scanned as an operator, so
                ## +.xyz / -.xyz are not number literals; write 0.xyz instead.
                in_sci_notation = False
                while ((idx < len(data))
                       and (isdigit(data[idx])
                            or data[idx] in ['+', '-', 'e', 'E', '.'])):
                    if (data[idx] in ['+', '-'] and not in_sci_notation):
                        break
                    elif (data[idx] in ['e', 'E']):
                        in_sci_notation = True
                    num = num + data[idx]
                    idx = idx + 1
                self.get_lexeme(num, tok_start_idx)
            elif (c == "\""):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while (idx < len(data) and (data[idx] != '\"')):
                    if (data[idx] == '\\'):
                        idx = idx + 1
                        if (data[idx] == 'n'):
                            s = s + '\n'
                        elif (data[idx] == 't'):
                            s = s + '\t'
                        else:
                            s = s + data[idx]
                    else:
                        s = s + data[idx]
                    idx = idx + 1
                if (idx < len(data)):
                    s = s + data[idx]  # consume the closing quote
                    idx = idx + 1
                self.get_lexeme(s, tok_start_idx)
            elif (istamil(c) or isalpha(c) or c == '_'):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while (
                    (idx < len(data)) and
                    (not data[idx] in EzhilToken.FORBIDDEN_FOR_IDENTIFIERS)):
                    s = s + data[idx]
                    idx = idx + 1
                self.get_lexeme(s, tok_start_idx)
            elif (c in self.unary_binary_ops):
                tok_start_idx = idx
                if (len(data) > (1 + idx)
                        and data[idx + 1] in ['=', '|', '&']):
                    c = c + data[idx + 1]
                    idx = idx + 1
                self.get_lexeme(c, tok_start_idx)
                idx = idx + 1
            elif c == ";":
                # treat as newline
                idx = idx + 1
                continue
            else:
                tok_start_idx = idx
                idx = idx + 1
                self.get_lexeme(c, tok_start_idx)

        tok_start_idx = idx
        ## close the file if not stdin_mode
        if (not self.stdin_mode): self.File.close()

        ## and manually add an EOF token.
        eof_tok = EzhilLexeme("", EzhilToken.EOF)
        eof_tok.set_line_col(self.get_line_col(tok_start_idx))
        self.tokens.append(eof_tok)
        if (self.debug):
            print("before reverse")
            self.dump_tokens()
        self.tokens.reverse()
        if (self.debug):
            print("after reverse")
            self.dump_tokens()
        return
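
The number branch above can be read in isolation. A standalone sketch of the same loop, showing how '+'/'-' are accepted only once scientific notation has started:

def scan_number(data, idx):
    num = data[idx]
    idx += 1
    in_sci_notation = False
    while idx < len(data) and (data[idx].isdigit()
                               or data[idx] in ['+', '-', 'e', 'E', '.']):
        if data[idx] in ['+', '-'] and not in_sci_notation:
            break  # a sign here begins an operator token, not an exponent
        elif data[idx] in ['e', 'E']:
            in_sci_notation = True
        num += data[idx]
        idx += 1
    return num, idx

print(scan_number("2.5e-3 * x", 0))  # -> ('2.5e-3', 6)
print(scan_number("10+20", 0))       # -> ('10', 2); '+' is left for the operator scan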
Example #5
def peek(self):
    """ return the Lexeme at the head of the Q without removing it """
    if len(self.tokens) == 0:
        raise ScannerException("tokens[] queue is empty ")
    return self.tokens[-1]
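
Note the design choice this depends on: tokenize() reverses the token list, so the logical head of the queue sits at the end and both peek and dequeue are O(1). In list form:

tokens = ["EOF", "NUMBER", "ID"]  # reversed: "ID" was scanned first
print(tokens[-1])    # peek    -> ID
print(tokens.pop())  # dequeue -> ID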
Example #6
    def tokenize(self, data=None):
        """ do the hard work of tokenizing and
        put Lexemes into the tokens[] Q """
        if ( self.stdin_mode ):
            if ( self.debug ): print(self.tokens)
            ## in stdin_mode, clean any EOF left over from a previous session out of the Q
            if ( len(self.tokens) != 0 ):
                self.match( Token.EOF )
            if ( len(self.tokens) != 0 ):
                raise ScannerException("Lexer: token Q has tokens from a previous session")
            self.tokens = list()
        else:
            data = "".join(self.File.readlines())
        
        idx = 0 
        tok_start_idx = 0

        while ( idx < len( data ) ):
            c = data[idx]
            
            if ( c == ' ' or c == '\t' or c == '\n' ):
                if ( c == '\n' ):
                    ##actual col = idx - col_idx
                    self.update_line_col(idx)
                idx = idx + 1
            elif ( c == '#' ):
                ## skip single-line comments, like Python/Octave
                while ( idx < len( data ) and data[idx] != '\n' ):
                    idx = idx + 1
            elif ( isdigit(c) or c == '+' or c == '-'  ):
                num = c
                tok_start_idx = idx
                idx = idx + 1
                ## FIXME: this prevents you from +.xyz, or -.xyz use 0.xyz 
                ## instead. also may throw an error if we exceed 
                ## buffer-length.                
                if ( c in ['+','-']  and ( idx < len( data ) ) 
                     and not isdigit(data[idx]) ):
                    self.get_lexeme( c , idx )
                    continue
                while ( ( idx < len( data) )
                            and ( isdigit(data[idx]) or data[idx] == '.') ):
                    num = num + data[idx]
                    idx = idx + 1
                self.get_lexeme( num , tok_start_idx  )
            elif ( c == "\"" ):
                tok_start_idx = idx 
                s = c
                idx = idx + 1
                while ( idx < len( data ) and
                         ( data[idx] != '\"' ) ):
                    s = s + data[idx]
                    if ( data[idx] == '\\' and (idx + 1) < len( data ) ):
                        idx = idx + 1
                        s = s + data[idx]  # keep the escaped character too
                    idx = idx + 1
                if ( idx < len( data ) ):
                    s = s + data[idx]  # consume the closing quote
                    idx = idx + 1
                self.get_lexeme( s , tok_start_idx )
            elif ( isalpha( c ) ):
                tok_start_idx = idx 
                s = c
                idx = idx + 1
                while ( ( idx < len( data ) )
                            and ( isalpha(data[idx]) or isdigit( data[idx] )
                                  or data[idx] in [ "\"", "_" ] ) ):
                    s = s + data[idx]
                    idx = idx + 1
                self.get_lexeme( s , tok_start_idx )
            elif ( c in self.unary_binary_ops ):
                tok_start_idx = idx 
                if ( len(data) > ( 1 + idx  ) 
                     and data[idx+1] in ['=','|','&']  ):
                    c = c +data[idx+1]
                    idx = idx + 1
                self.get_lexeme(  c , tok_start_idx )
                idx = idx + 1
            else:
                tok_start_idx = idx 
                idx = idx + 1
                self.get_lexeme( c , tok_start_idx )
        
        tok_start_idx = idx 

        ## close the file if not stdin_mode
        if ( not self.stdin_mode ): self.File.close()

        ## and manually add an EOF token.
        eof_tok = Lexeme("", Token.EOF)
        eof_tok.set_line_col( self.get_line_col( tok_start_idx ) )
        self.tokens.append( eof_tok )

        self.tokens.reverse()
        return 
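
Unlike the Ezhil tokenize() in Example #4, this older scanner keeps escape sequences verbatim rather than decoding them. A standalone sketch of the raw pass-through (it assumes the same one-character escape convention):

def scan_string_raw(data, idx):
    s = data[idx]  # opening quote
    idx += 1
    while idx < len(data) and data[idx] != '"':
        s += data[idx]
        if data[idx] == '\\' and idx + 1 < len(data):
            idx += 1
            s += data[idx]  # keep the escaped character as well
        idx += 1
    if idx < len(data):
        s += data[idx]  # closing quote
        idx += 1
    return s, idx

print(scan_string_raw(r'"a\"b" + x', 0))  # -> ('"a\\"b"', 6)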
Example #7
    def get_lexeme(self, chunks, pos):
        if chunks is None:
            return None

        if chunks == "print":
            tval = Lexeme(chunks, Token.PRINT)
        elif chunks == "if":
            tval = Lexeme(chunks, Token.IF)
        elif chunks == "elseif":
            tval = Lexeme(chunks, Token.ELSEIF)
        elif chunks == "else":
            tval = Lexeme(chunks, Token.ELSE)
        elif chunks == "for":
            tval = Lexeme(chunks, Token.FOR)
        elif chunks == "while":
            tval = Lexeme(chunks, Token.WHILE)
        elif chunks == "do":
            tval = Lexeme(chunks, Token.DO)
        elif chunks == "return":
            tval = Lexeme(chunks, Token.RETURN)
        elif chunks == "end":
            tval = Lexeme(chunks, Token.END)
        elif chunks == "def":
            tval = Lexeme(chunks, Token.DEF)
        elif chunks == "continue":
            tval = Lexeme(chunks, Token.CONTINUE)
        elif chunks == "break":
            tval = Lexeme(chunks, Token.BREAK)
        elif chunks == "=":
            tval = Lexeme(chunks, Token.EQUALS)
        elif chunks == "-":
            tval = Lexeme(chunks, Token.MINUS)
        elif chunks == "+":
            tval = Lexeme(chunks, Token.PLUS)
        elif chunks == ">":
            tval = Lexeme(chunks, Token.GT)
        elif chunks == "<":
            tval = Lexeme(chunks, Token.LT)
        elif chunks == ">=":
            tval = Lexeme(chunks, Token.GTEQ)
        elif chunks == "<=":
            tval = Lexeme(chunks, Token.LTEQ)
        elif chunks == "==":
            tval = Lexeme(chunks, Token.EQUALITY)
        elif chunks == "!=":
            tval = Lexeme(chunks, Token.NEQ)
        elif chunks == "*":
            tval = Lexeme(chunks, Token.PROD)
        elif chunks == "/":
            tval = Lexeme(chunks, Token.DIV)
        elif chunks == ",":
            tval = Lexeme(chunks, Token.COMMA)
        elif chunks == "(":
            tval = Lexeme(chunks, Token.LPAREN)
        elif chunks == ")":
            tval = Lexeme(chunks, Token.RPAREN)
        elif chunks == "[":
            tval = Lexeme(chunks, Token.LSQRBRACE)
        elif chunks == "]":
            tval = Lexeme(chunks, Token.RSQRBRACE)
        elif chunks == "{":
            tval = Lexeme(chunks, Token.LCURLBRACE)
        elif chunks == "}":
            tval = Lexeme(chunks, Token.RCURLBRACE)
        elif chunks == ":":
            tval = Lexeme(chunks, Token.COLON)
        elif chunks == "%":
            tval = Lexeme(chunks, Token.MOD)
        elif chunks == "^":
            tval = Lexeme(chunks, Token.EXP)
        elif chunks == "&&":
            tval = Lexeme(chunks, Token.LOGICAL_AND)
        elif chunks == "&":
            tval = Lexeme(chunks, Token.BITWISE_AND)
        elif chunks == "||":
            tval = Lexeme(chunks, Token.LOGICAL_OR)
        elif chunks == "|":
            tval = Lexeme(chunks, Token.BITWISE_OR)
        elif (chunks[0] == "\"" and chunks[-1] == "\""):
            tval = Lexeme(chunks[1:-1], Token.STRING)
        elif isdigit(chunks[0]) or chunks[0] == '+' or chunks[0] == '-':
            # deduce a float or an integer
            if (chunks.find('.') >= 0 or chunks.find('e') >= 0
                    or chunks.find('E') >= 0):
                tval = Lexeme(float(chunks), Token.NUMBER)
            else:
                tval = Lexeme(int(chunks), Token.NUMBER)
        elif isalpha(chunks[0]):
            tval = Lexeme(chunks, Token.ID)
        else:
            raise ScannerException("Lexical error: " + str(chunks) +
                                   " at Line, Col " +
                                   str(self.get_line_col(pos)) +
                                   " in file " + self.fname)

        [l, c] = self.get_line_col(pos)
        tval.set_line_col([l, c])
        tval.set_file_name(self.fname)
        self.tokens.append(tval)
        return l
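
The float-vs-int deduction at the bottom of get_lexeme() is worth isolating. A minimal sketch of the same rule:

def deduce_number(chunk):
    # any '.', 'e' or 'E' makes it a float; otherwise an int
    if any(ch in chunk for ch in ".eE"):
        return float(chunk)
    return int(chunk)

print(deduce_number("42"))      # -> 42
print(deduce_number("1.5e-3"))  # -> 0.0015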
Example #8
    def get_lexeme(self, chunks, pos):
        if (self.debug):
            print(u"get_lexeme", chunks, pos)

        if chunks is None:
            return None

        if chunks == u"பதிப்பி":
            tval = EzhilLexeme(chunks, EzhilToken.PRINT)
        elif chunks == u"தேர்ந்தெடு":
            tval = EzhilLexeme(chunks, EzhilToken.SWITCH)
        elif chunks == u"தேர்வு":
            tval = EzhilLexeme(chunks, EzhilToken.CASE)
        elif chunks == u"ஏதேனில்":
            tval = EzhilLexeme(chunks, EzhilToken.OTHERWISE)
        elif chunks == u"ஆனால்":
            tval = EzhilLexeme(chunks, EzhilToken.IF)
        elif chunks == u"இல்லைஆனால்":
            tval = EzhilLexeme(chunks, EzhilToken.ELSEIF)
        elif chunks == u"இல்லை":
            tval = EzhilLexeme(chunks, EzhilToken.ELSE)
        elif chunks == u"ஆக":
            tval = EzhilLexeme(chunks, EzhilToken.FOR)
        elif chunks == u"ஒவ்வொன்றாக":
            tval = EzhilLexeme(chunks, EzhilToken.FOREACH)
        elif chunks == u"இல்":
            tval = EzhilLexeme(chunks, EzhilToken.COMMA)
        elif chunks == u"வரை":
            tval = EzhilLexeme(chunks, EzhilToken.WHILE)
        elif chunks == u"செய்":
            tval = EzhilLexeme(chunks, EzhilToken.DO)
        elif chunks == u"முடியேனில்":
            tval = EzhilLexeme(chunks, EzhilToken.DOWHILE)
        elif chunks == u"பின்கொடு":
            tval = EzhilLexeme(chunks, EzhilToken.RETURN)
        elif chunks == u"முடி":
            tval = EzhilLexeme(chunks, EzhilToken.END)
        elif chunks == u"நிரல்பாகம்":
            tval = EzhilLexeme(chunks, EzhilToken.DEF)
        elif chunks == u"தொடர்":
            tval = EzhilLexeme(chunks, EzhilToken.CONTINUE)
        elif chunks == u"நிறுத்து":
            tval = EzhilLexeme(chunks, EzhilToken.BREAK)
        elif chunks == u"@":
            tval = EzhilLexeme(chunks, EzhilToken.ATRATEOF)
        elif chunks == u"=":
            tval = EzhilLexeme(chunks, EzhilToken.EQUALS)
        elif chunks == u"-":
            tval = EzhilLexeme(chunks, EzhilToken.MINUS)
        elif chunks == u"+":
            tval = EzhilLexeme(chunks, EzhilToken.PLUS)
        elif chunks == u">":
            tval = EzhilLexeme(chunks, EzhilToken.GT)
        elif chunks == u"<":
            tval = EzhilLexeme(chunks, EzhilToken.LT)
        elif chunks == u">=":
            tval = EzhilLexeme(chunks, EzhilToken.GTEQ)
        elif chunks == u"<=":
            tval = EzhilLexeme(chunks, EzhilToken.LTEQ)
        elif chunks == u"==":
            tval = EzhilLexeme(chunks, EzhilToken.EQUALITY)
        elif chunks == u"!=":
            tval = EzhilLexeme(chunks, EzhilToken.NEQ)
        elif chunks == u"*":
            tval = EzhilLexeme(chunks, EzhilToken.PROD)
        elif chunks == u"/":
            tval = EzhilLexeme(chunks, EzhilToken.DIV)
        elif chunks == u",":
            tval = EzhilLexeme(chunks, EzhilToken.COMMA)
        elif chunks == u"(":
            tval = EzhilLexeme(chunks, EzhilToken.LPAREN)
        elif chunks == u")":
            tval = EzhilLexeme(chunks, EzhilToken.RPAREN)
        elif chunks == u"[":
            tval = EzhilLexeme(chunks, EzhilToken.LSQRBRACE)
        elif chunks == u"]":
            tval = EzhilLexeme(chunks, EzhilToken.RSQRBRACE)
        elif chunks == u"{":
            tval = Lexeme(chunks, Token.LCURLBRACE)
        elif chunks == u"}":
            tval = Lexeme(chunks, Token.RCURLBRACE)
        elif chunks == u":":
            tval = Lexeme(chunks, Token.COLON)
        elif chunks == u"%":
            tval = EzhilLexeme(chunks, EzhilToken.MOD)
        elif chunks == u"^":
            tval = EzhilLexeme(chunks, EzhilToken.EXP)
        elif chunks == u"&&":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_AND)
        elif chunks == u"&":
            tval = Lexeme(chunks, EzhilToken.BITWISE_AND)
        elif chunks == u"||":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_OR)
        elif chunks == u"|":
            tval = Lexeme(chunks, EzhilToken.BITWISE_OR)
        elif chunks == u"!":
            tval = Lexeme(chunks, EzhilToken.LOGICAL_NOT)
        elif (chunks[0] == u"\"" and chunks[-1] == u"\""):
            tval = EzhilLexeme(chunks[1:-1], EzhilToken.STRING)
        elif chunks[0].isdigit() or chunks[0] == '+' or chunks[0] == '-':
            # deduce a float or an integer
            if (chunks.find(u'.') >= 0 or chunks.find(u'e') >= 0
                    or chunks.find(u'E') >= 0):
                tval = EzhilLexeme(float(chunks), EzhilToken.NUMBER)
            else:
                tval = EzhilLexeme(int(chunks), EzhilToken.NUMBER)
        else:
            ## check for tamil/english/mixed identifiers, even those starting with a lead '_'
            match_obj = re.match(EzhilToken.RE_ALPHA_NUMERIC_, chunks)
            if match_obj:
                if len(match_obj.group(0)) != len(chunks):
                    raise ScannerException(
                        u"Lexical error: Invalid identifier name '" +
                        unicode(chunks) + u"' at Line , Col " +
                        unicode(self.get_line_col(pos)) + u" in file " +
                        self.fname)
                tval = EzhilLexeme(chunks, EzhilToken.ID)
            else:
                raise ScannerException(u"Lexical error: " + unicode(chunks) +
                                       u" at Line , Col " +
                                       unicode(self.get_line_col(pos)) +
                                       u" in file " + self.fname)

        [l, c] = self.get_line_col(pos)
        tval.set_line_col([l, c])
        tval.set_file_name(self.fname)
        self.tokens.append(tval)

        if (self.debug): print(u"Lexer token = ", tval)
        return l
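
The identifier branch insists that the regular expression match the whole chunk, not just a prefix. A sketch of that check; the real pattern lives in EzhilToken.RE_ALPHA_NUMERIC_, so this ASCII-only stand-in is an assumption:

import re

RE_ALPHA_NUMERIC_ = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")

def is_valid_identifier(chunk):
    match_obj = RE_ALPHA_NUMERIC_.match(chunk)
    # a prefix-only match (e.g. "foo" inside "foo$bar") must be rejected
    return bool(match_obj) and len(match_obj.group(0)) == len(chunk)

print(is_valid_identifier("foo_1"))    # -> True
print(is_valid_identifier("foo$bar"))  # -> False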
Example #9
    def tokenize(self, data=None):
        """ do hard-work of tokenizing and
        put EzhilLexemes into the tokens[] Q """
        if (self.debug): print(u"Start of Ezhil lexer - begin tokenize")
        if (self.stdin_mode):
            if (self.debug): print(self.tokens)
            ## in stdin_mode, clean any EOF left over from a previous session out of the Q
            if (len(self.tokens) != 0):
                self.match(EzhilToken.EOF)
            if (len(self.tokens) != 0):
                raise ScannerException(
                    "Lexer: token Q has tokens from a previous session")
            self.tokens = list()
        else:
            if hasattr(self.File, 'data'):
                data = self.File.data
            else:
                data = u"".join(self.File.readlines())
        if (self.debug): print(data)
        idx = 0
        tok_start_idx = 0

        while (idx < len(data)):
            c = data[idx]
            if (self.debug): print(idx, c)
            if (istamil(c) or c.isalpha() or c == u'_'):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while ((idx < len(data))
                       and self.is_allowed_for_identifier(data[idx])):
                    s = s + data[idx]
                    idx = idx + 1
                if idx < len(data) and not data[idx].isspace():
                    if data[idx] in ['#', '$', '@', '\'', '"']:
                        raise ScannerException(
                            "Lexer: token %s is not valid for identifier, with prefix %s"
                            % (data[idx], s))
                self.get_lexeme(s, tok_start_idx)
            elif (c.isspace()):  # or c in u' 'or c == u'\t' or c == u'\n'
                if (c == u'\n'):
                    ##actual col = idx - col_idx
                    self.update_line_col(idx)
                idx = idx + 1
            elif (c == u'\r'):
                idx = idx + 1
                continue
            elif (c == u'#'):
                ## skip single-line comments, like Python/Octave
                start = idx
                while (idx < len(data) and not (data[idx] in [u'\r', u'\n'])):
                    idx = idx + 1
                if (idx < len(data) and data[idx] == u'\r'):
                    idx = idx + 1
                end = idx
                self.comments[self.line] = data[start:end]
            elif (c.isdigit()):
                num = c
                tok_start_idx = idx
                idx = idx + 1
                ## note: a leading '+' or '-' is scanned as an operator, so
                ## +.xyz / -.xyz are not number literals; write 0.xyz instead.
                in_sci_notation = False
                while ((idx < len(data))
                       and (data[idx].isdigit()
                            or data[idx] in [u'+', u'-', u'e', u'E', u'.'])):
                    if (data[idx] in [u'+', u'-'] and not in_sci_notation):
                        break
                    elif (data[idx] in [u'e', u'E']):
                        in_sci_notation = True
                    num = num + data[idx]
                    idx = idx + 1
                self.get_lexeme(num, tok_start_idx)
            elif (c == u"\""):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while (idx < len(data) and (data[idx] != u'\"')):
                    if (data[idx] == u'\\'):
                        idx = idx + 1
                        if (data[idx] == u'n'):
                            s = s + u'\n'
                        elif (data[idx] == u't'):
                            s = s + u'\t'
                        else:
                            s = s + data[idx]
                    else:
                        s = s + data[idx]
                    idx = idx + 1
                if (idx < len(data)):
                    s = s + data[idx]  # consume the closing quote
                    idx = idx + 1
                self.get_lexeme(s, tok_start_idx)
            elif (c in self.unary_binary_ops):
                tok_start_idx = idx
                if (len(data) > (1 + idx)
                        and data[idx + 1] in [u'=', u'|', u'&']):
                    c = c + data[idx + 1]
                    idx = idx + 1
                self.get_lexeme(c, tok_start_idx)
                idx = idx + 1
            elif c == u";":
                # treat as newline
                idx = idx + 1
                continue
            else:
                tok_start_idx = idx
                idx = idx + 1
                self.get_lexeme(c, tok_start_idx)

        tok_start_idx = idx
        ## close the file if not stdin_mode
        if (not self.stdin_mode): self.File.close()

        ## and manually add an EOF token.
        eof_tok = EzhilLexeme("", EzhilToken.EOF)
        eof_tok.set_line_col(self.get_line_col(tok_start_idx))
        self.tokens.append(eof_tok)
        if (self.debug):
            print(u"before reverse")
            self.dump_tokens()
        self.tokens.reverse()
        if (self.debug):
            print(u"after reverse")
            self.dump_tokens()
        return
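
Finally, the string branch of this tokenize() decodes escapes as it scans: '\n' and '\t' are translated, and anything else after a backslash is kept as-is. A standalone sketch of just that decoding step:

def decode_escapes(body):
    out, i = "", 0
    while i < len(body):
        if body[i] == '\\' and i + 1 < len(body):
            i += 1
            out += {'n': '\n', 't': '\t'}.get(body[i], body[i])
        else:
            out += body[i]
        i += 1
    return out

print(repr(decode_escapes(r"a\tb\n")))  # -> 'a\tb\n'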