Exemplo n.º 1
0
    def getNextToken(self):
        """Return the next token from the input stream, or None.

        None is returned when no input stream is set or when the input
        buffer has no more content and nothing is left to tokenize.
        Tokens still queued on the internal stack are handed out first.
        """
        if not self._instream:
            return None

        if self._stack:
            return self._stack.pop()

        if not self._inputBuffer:
            # Create the buffer lazily. The fill size is two so that a
            # possible escape character can be detected.
            self._inputBuffer = InputBuffer(self._instream, fillSize=2)

        pending = ""
        tokenLine = 0
        tokenColumn = 0

        while True:

            # Re-read the attribute on every pass: a mode change replaces
            # the buffer object.
            content = self._inputBuffer.getContent()
            if not content:
                break

            chunk, terminated = self._consume()
            chunkStart = self._updatePosInfo(chunk)

            if self._mode == LexerMode.NORMAL:
                # The first chunk of a new token fixes its start position.
                if chunkStart and not pending:
                    tokenLine = chunkStart[0]
                    tokenColumn = chunkStart[1]
                pending += chunk

            if not terminated:
                continue

            if self._mode != LexerMode.NORMAL:
                # Comment mode => check for a mode change using the
                # buffer content.
                self._checkForModeChange(content)
                continue

            token = self._handleComsumption(pending, tokenLine, tokenColumn)
            if token:
                return token
            pending = ""

        if pending:
            return self._handleComsumption(pending, tokenLine, tokenColumn)
        return None
Exemplo n.º 2
0
 def _checkForModeChange(self,consumed):
     
     res = consumed
             
     if not consumed:
         return res
             
     if self._mode == LexerMode.NORMAL:
             
         if self._lineCommentEnabled and \
            self._matchesCommentBegin(consumed, self._lineCommentStart):
             
             self._mode = LexerMode.LINE_COMMENT
             self._inputBuffer = InputBuffer(self._instream,
                                             fillSize = 1 # <- Länge von '\n'
                                             )
             
             res = ""
                 
         elif self._blockCommentEnabled and \
              self._matchesCommentBegin(consumed, self._blockCommentStart):
             
             self._mode = LexerMode.BLOCK_COMMENT
             size = len(self._blockCommentEnd);
             self._inputBuffer = InputBuffer(self._instream,
                                             fillSize = size
                                             )
             res = ""
             
     else:
         
         self._mode = LexerMode.NORMAL
         self._inputBuffer = InputBuffer(self._instream,
                                         fillSize = 2 # <- Länge zwei wg. Escape-Zeichen
                                         )
         res = ""
     
     return res
Exemplo n.º 3
0
class Lexer(object):
    """Lexer that splits an input stream into tokens.

    Text is read through an InputBuffer and cut at whitespace characters
    (whitespace inside literal delimiters does not terminate a chunk).
    Each chunk is then broken down into tokens using the registered token
    types: literal, separators, prefixes, postfixes, keywords and words.
    Line comments and block comments can be enabled optionally.
    """

    def __init__(self):
        """Create a lexer with no input stream and no token types."""

        self._instream = None
        # Created lazily by getNextToken(); replaced on every mode change.
        self._inputBuffer = None
        # Tokens of the current chunk, stored so that pop() yields them
        # in source order.
        self._stack = []
        self._keywords = {}
        self._words = []
        self._prefixes = []
        self._postfixes = []
        self._separators = []
        self._literal = None
        self._literalDelims = []
        self._literalEscChar = None
        # Delimiter of the literal currently being read ('' if none).
        self._currentLitDelim = ''
        self._wsCharCodes = [
                             WSCharCode.TAB,
                             WSCharCode.LINEBREAK,
                             WSCharCode.VTAB,
                             WSCharCode.FORMFEED,
                             WSCharCode.SPACE
                             ]
        self._mode = LexerMode.NORMAL
        self._lineCommentEnabled = False
        self._lineCommentStart = ''
        self._blockCommentEnabled = False
        self._blockCommentStart = ''
        self._blockCommentEnd = ''

        self._line = 1
        self._column = 0

    def setInputStream(self, instream):
        """Set the stream to read from and reset the lexer state."""

        self._instream = instream
        self._reset()

    def _reset(self):
        """Drop queued tokens and the buffer; restart at line 1, column 0."""

        self._stack = []
        self._inputBuffer = None
        self._mode = LexerMode.NORMAL
        self._line = 1
        self._column = 0

    @staticmethod
    def _appendSorted(bucket, tt):
        """Append `tt` to `bucket` and re-sort it with TokenType.compare."""

        # Local import so this block does not depend on the file header.
        import functools

        bucket.append(tt)
        # list.sort(cmp=...) no longer exists in Python 3; cmp_to_key
        # gives the same ordering and is available from Python 2.7 on.
        bucket.sort(key=functools.cmp_to_key(TokenType.compare))

    def addTokenType(self, tt):
        """Register a token type with the lexer.

        Keywords are stored by their keyword text, words are kept in
        insertion order, prefixes/postfixes/separators are kept sorted by
        TokenType.compare, and a Literal replaces any previous literal.

        Raises:
            Exception: if `tt` is none of the supported token types.
        """

        if isinstance(tt, Keyword):
            self._keywords[tt.getKeyword()] = tt
        elif isinstance(tt, Word):
            self._words.append(tt)
        elif isinstance(tt, Prefix):
            self._appendSorted(self._prefixes, tt)
        elif isinstance(tt, Postfix):
            self._appendSorted(self._postfixes, tt)
        elif isinstance(tt, Separator):
            self._appendSorted(self._separators, tt)
        elif isinstance(tt, Literal):
            self._literal = tt
            self._literalDelims = tt.DELIMITERS
            self._literalEscChar = Literal.ESCAPE_CHAR
        else:
            raise Exception('Unknown token type')

    def enableLineComments(self, lineCommentStart='//'):
        """Enable line comments introduced by `lineCommentStart`."""

        self._lineCommentEnabled = True
        self._lineCommentStart = lineCommentStart

    def enableBlockComments(self,
                            blockCommentStart='/*',
                            blockCommentEnd='*/'):
        """Enable block comments delimited by the given start/end markers."""

        self._blockCommentEnabled = True
        self._blockCommentStart = blockCommentStart
        self._blockCommentEnd = blockCommentEnd

    def getNextToken(self):
        """Return the next token, or None if there is nothing left.

        None is also returned when no input stream has been set. Tokens
        queued on the internal stack are returned before more input is
        read.

        Raises:
            Exception: propagated from _handleComsumption() when a chunk
                cannot be matched to any token type.
        """

        if not self._instream:
            return None

        if self._stack:
            return self._stack.pop()

        consumed = ""
        startLine = 0
        startColumn = 0

        if not self._inputBuffer:
            # Create the buffer. Fill size two so that a possible escape
            # character can be detected.
            self._inputBuffer = InputBuffer(self._instream, fillSize=2)

        while True:

            # Re-read the attribute each time: mode changes replace it.
            content = self._inputBuffer.getContent()

            if not content:
                break

            consumedChars, isTermination = self._consume()
            # NOTE(review): in NORMAL mode _consume() does not return the
            # terminating whitespace character, so _updatePosInfo() never
            # sees it; reported line/column values may lag behind the
            # real input position - confirm against the expected output.
            curStartPos = self._updatePosInfo(consumedChars)

            if self._mode == LexerMode.NORMAL:

                # The first chunk of a new token fixes its start position.
                if not consumed and curStartPos:
                    startLine = curStartPos[0]
                    startColumn = curStartPos[1]

                consumed += consumedChars

            if not isTermination:
                continue

            if self._mode == LexerMode.NORMAL:

                res = self._handleComsumption(consumed, startLine, startColumn)
                if res:
                    return res
                consumed = ""

            else:
                # Comment mode => check for a mode change using the
                # buffer content.
                self._checkForModeChange(content)

        if consumed:
            return self._handleComsumption(consumed, startLine, startColumn)
        return None

    def _updatePosInfo(self, content):
        """Advance the line/column counters over `content`.

        Returns the (line, column) pair as it stands right after the
        first character has been counted, or None for empty content.
        """

        res = None

        for i, ch in enumerate(content):

            if ord(ch) != WSCharCode.LINEBREAK:
                self._column += 1
            else:
                self._line += 1
                self._column = 0

            if i == 0:
                res = (self._line, self._column)

        return res

    def _checkForModeChange(self, consumed):
        """Switch the lexer mode if `consumed` requires it.

        In NORMAL mode the text is tested against the enabled comment
        start markers; in any comment mode the lexer returns to NORMAL
        mode. Every mode change installs a new InputBuffer.

        Returns "" if the mode was changed, otherwise `consumed` itself
        (empty/None input never changes the mode).
        """

        if not consumed:
            return consumed

        if self._mode != LexerMode.NORMAL:
            self._mode = LexerMode.NORMAL
            self._inputBuffer = InputBuffer(self._instream,
                                            fillSize=2  # <- two because of escape characters
                                            )
            return ""

        if self._lineCommentEnabled and \
           self._matchesCommentBegin(consumed, self._lineCommentStart):

            self._mode = LexerMode.LINE_COMMENT
            self._inputBuffer = InputBuffer(self._instream,
                                            fillSize=1  # <- length of '\n'
                                            )
            return ""

        if self._blockCommentEnabled and \
           self._matchesCommentBegin(consumed, self._blockCommentStart):

            self._mode = LexerMode.BLOCK_COMMENT
            size = len(self._blockCommentEnd)
            self._inputBuffer = InputBuffer(self._instream,
                                            fillSize=size
                                            )
            return ""

        return consumed

    def _getTokens(self, text, startLine, startColumn):
        """Break a whitespace-free chunk of text down into tokens.

        The chunk is tried, in this order, as a literal, then split at
        the first matching separator, prefix or postfix (the remaining
        parts are processed recursively), and finally matched against
        keywords and words.

        Returns a list of tokens in reverse source order, so that
        list.pop() hands them out left to right.

        Raises:
            Exception: if `text` matches no registered token type.
        """

        # Handle literals:
        if self._literal:
            token = self._literal.createToken(text)
            if token:
                token.setStartPosition(startLine, startColumn)
                return [token]

        res = []

        # Find separators and split:
        for sep in self._separators:

            token = sep.createToken(text)

            if token:
                # Right part first: the order is reversed because the
                # tokens are later popped from the end of the list.
                right = sep.getRemainingRight(text)
                left = sep.getRemainingLeft(text)

                if right:
                    col = startColumn + len(left) + len(token.getText())
                    res = self._getTokens(right, startLine, col)

                token.setStartPosition(startLine, startColumn + len(left))

                res.append(token)

                if left:
                    res += self._getTokens(left, startLine, startColumn)

                return res

        # Find prefixes:
        for prefix in self._prefixes:

            token = prefix.createToken(text)

            if token:

                right = prefix.getRemainingRight(text)
                if right:
                    col = startColumn + len(token.getText())
                    res = self._getTokens(right, startLine, col)

                token.setStartPosition(startLine, startColumn)

                res.append(token)

                return res

        # Find postfixes:
        for postfix in self._postfixes:

            token = postfix.createToken(text)

            if token:

                left = postfix.getRemainingLeft(text)

                col = startColumn + len(left)
                token.setStartPosition(startLine, col)

                res.append(token)

                if left:
                    res += self._getTokens(left, startLine, startColumn)

                return res

        # Find (key)words:

        try:
            matchingWords = [self._keywords[text]]
        except KeyError:
            # Maybe a case insensitive keyword? Those are looked up by
            # their upper-case spelling.
            try:
                kw = self._keywords[text.upper()]
            except KeyError:
                matchingWords = []
            else:
                matchingWords = [] if kw.isCaseSensitive() else [kw]

        matchingWords += [word for word in self._words if word.matches(text)]

        if matchingWords:
            token = Token(text, matchingWords)
            token.setStartPosition(startLine, startColumn)
            res.append(token)
            return res

        raise Exception("Unknown token '%s' at line %d, column %d" % (text, startLine, startColumn))

    def _handleComsumption(self, consumed, startLine, startColumn):
        """Turn a finished chunk into tokens and return the first one.

        If the chunk starts a comment, the mode is switched and None is
        returned. Otherwise the chunk's tokens are stored on the stack
        and the first of them is popped and returned.

        Raises:
            Exception: if the chunk cannot be matched to any token type.
        """

        consumed = self._checkForModeChange(consumed)

        if consumed:
            self._stack = self._getTokens(consumed, startLine, startColumn)
            if self._stack:
                return self._stack.pop()
            else:
                raise Exception("Unknown token '%s' at line %d, column %d" \
                                % (consumed, startLine, startColumn))

        return None

    def _consume(self):
        """Consume characters from the input buffer according to the mode.

        Returns a pair (consumed, isTermination): the text taken over and
        a flag telling whether the current token/comment ends here.

        Raises:
            Exception: if the buffer is empty or the mode is undefined.
        """

        text = self._inputBuffer.getContent()
        if not text:
            raise Exception('Must not consume empty content')

        if self._mode == LexerMode.NORMAL:

            isTermination = False
            consumed = ''

            textLen = len(text)
            lastIdx = textLen - 1
            prevChar = None
            for idx in range(textLen):
                ch = text[idx]
                if idx != lastIdx or ch != self._literalEscChar or textLen == 1:
                    consumedChar = self._inputBuffer.consumeChar()
                    if prevChar is None or prevChar != self._literalEscChar:
                        # Unescaped character: whitespace outside of a
                        # literal ends the chunk and is not returned.
                        isTermination = self._isWhiteSpace(consumedChar)
                        if isTermination:
                            break
                        consumed += consumedChar
                    elif consumedChar not in self._literalDelims:
                        # Escape character followed by an ordinary
                        # character: keep both.
                        consumed += consumedChar
                    else:
                        # Escaped literal delimiter: it replaces the
                        # escape character and does not toggle the
                        # literal state.
                        consumed = consumed[:-1] + consumedChar
                else:
                    # Do not consume an escape character at the last
                    # position of the buffer.
                    break
                prevChar = consumedChar

        elif self._mode == LexerMode.LINE_COMMENT:

            # The buffer is created with fill size one in this mode, so
            # `text` is expected to be a single character here.
            isTermination = ord(text) == WSCharCode.LINEBREAK
            consumed = self._inputBuffer.consumeAll()

        elif self._mode == LexerMode.BLOCK_COMMENT:

            # NOTE(review): the end marker is only found if it fills the
            # buffer exactly; whether a marker split across two buffer
            # fills is detected depends on InputBuffer - confirm.
            isTermination = text == self._blockCommentEnd
            consumed = self._inputBuffer.consumeAll()

        else:

            raise Exception("Undefined lexer mode")

        return consumed, isTermination

    def _isWhiteSpace(self, ch):
        """Tell whether `ch` is a chunk-terminating whitespace character.

        Side effect: literal delimiters open/close the current literal
        (tracked in _currentLitDelim). While a literal is open, nothing
        counts as whitespace.
        """

        if ch in self._literalDelims:
            if self._currentLitDelim:
                # Only the delimiter that opened the literal closes it.
                if ch == self._currentLitDelim:
                    self._currentLitDelim = ""
            else:
                self._currentLitDelim = ch
            return False
        elif self._currentLitDelim:
            return False
        else:
            return ord(ch) in self._wsCharCodes

    def _matchesCommentBegin(self, consumed, commentBegin):
        """Return True if `consumed` starts with `commentBegin`.

        As before, text containing a line break after the marker does
        not match, because '.' does not match '\\n'.
        """

        # re.escape covers every regex metacharacter, not just '*', so
        # markers such as '(*' or '+' are matched literally.
        regex = r"\A%s.*\Z" % re.escape(commentBegin)

        return bool(re.match(regex, consumed))