Example #1: a generator that yields only the Tamil letters of a file, ending with a plain return.
 def next_tamil_letter(self):
     self.handle = codecs.open(self.filename,'r','utf-8')
     for letter in utf8.get_letters_iterable(self.handle.read()):
         if ( utf8.istamil( letter ) ):
             yield letter
     return # a bare return ends the generator cleanly
Example #2: the same generator built on utf8.get_letters, ending with an explicit raise StopIteration.
 def next_tamil_letter(self):
     self.handle = codecs.open(self.filename, 'r', 'utf-8')
     for letter in utf8.get_letters(self.handle.read()):
         if (utf8.istamil(letter)):
             yield letter
     raise StopIteration # under PEP 479 (Python 3.7+) this escapes as RuntimeError; prefer a bare return
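The two endings are not equivalent on modern Python. PEP 479 (the default from Python 3.7) turns a StopIteration raised inside a generator into RuntimeError, so the bare return of Example #1 is the safe form. A minimal standalone sketch, not from the Ezhil sources:

 def ends_with_return():
     yield 1
     return                # iteration simply ends

 def ends_with_raise():
     yield 1
     raise StopIteration   # PEP 479 converts this to RuntimeError

 print(list(ends_with_return()))   # [1]
 try:
     list(ends_with_raise())       # Python 3.7+
 except RuntimeError as err:
     print(err)                    # generator raised StopIteration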
Example #3: the tokenize method of the Ezhil lexer.
 def tokenize(self,data=None):
     """ do hard-work of tokenizing and
     put EzhilLexemes into the tokens[] Q """
     if ( self.debug ): print(u"Start of Ezhil lexer - begin tokenize")
     if ( self.stdin_mode ):
         if ( self.debug ): print(self.tokens)
         ## in stdin_mode, drain any EOF token left over from the previous session
         if ( len(self.tokens) != 0 ):
             self.match( EzhilToken.EOF )
         if ( len(self.tokens) != 0 ):
             raise ScannerException("Lexer: token Q has previous session tokens")
         self.tokens = list()
     else:
         if hasattr(self.File,'data'):
             data = self.File.data
         elif self.encoding == "utf-8":
             data = self.File.read()
         elif self.encoding == "tscii":
             if self.debug: print("Loading TSCII converted data -> ")
             data = self.converted_data
         else:
             assert False, "unknown encoding: %s" % str(self.encoding)
     if ( self.debug ): print(data)
     idx = 0
     tok_start_idx = 0
     
     while ( idx < len( data ) ):
         c = data[idx]
         if ( self.debug ): print(idx,c)
         if ( istamil( c ) or c.isalpha( ) or c == u'_' ):
             tok_start_idx = idx
             s = c; idx = idx + 1
             while ( ( idx < len( data ) )
                     and self.is_allowed_for_identifier( data[idx] ) ):
                 s = s + data[idx]
                 idx = idx + 1
             if idx < len(data) and not data[idx].isspace():
                 if  data[idx] in ['#','$','@','\'','"']:
                     raise ScannerException("Lexer: token %s is not valid for identifier, with prefix %s"%(data[idx],s))
             self.get_lexeme( s , tok_start_idx )
         elif ( c.isspace() ): # space, tab or newline
             if ( c == u'\n' ):
                 ##actual col = idx - col_idx
                 self.update_line_col(idx)
             idx = idx + 1
         elif ( c == u'\r' ):
             idx = idx + 1
             continue
         elif ( c == u'#' ):
             ## skip single-line comments, as in Python/Octave
             start = idx;
             while ( idx < len( data ) and not (data[idx] in [u'\r',u'\n']) ):
                 idx = idx + 1
             if ( idx < len(data) and data[idx] == u'\r' ):
                 idx = idx + 1
             end = idx
             self.comments[self.line]= data[start:end]
         elif ( c.isdigit() ): #or c == '+' or c == '-'  ):
             num = c
             tok_start_idx = idx
             idx = idx + 1
             ## FIXME: leading +.xyz or -.xyz forms are not supported;
             ## write 0.xyz instead. This may also throw an error if we
             ## read past the end of the buffer.
             if ( c in [u'+',u'-']  and ( idx < len( data ) ) 
                  and not data[idx].isdigit() ):
                 self.get_lexeme( c , idx )
                 continue
             in_sci_notation = False
             while ( ( idx < len( data) )
                         and ( data[idx].isdigit() or data[idx] in [u'+',u'-',u'e',u'E',u'.']) ):
                 if ( data[idx] in [u'+',u'-'] and not in_sci_notation ):
                     break;
                 elif( data[idx] in [u'e',u'E'] ):
                     in_sci_notation = True
                 num = num + data[idx]
                 idx = idx + 1
             self.get_lexeme( num , tok_start_idx  )
         elif ( c == u"\"" ):
             tok_start_idx = idx 
             s = c; idx = idx + 1
             while ( idx < len( data ) and
                      ( data[idx] != u'\"' ) ):
                 if ( data[idx] == u'\\' ):
                     idx = idx + 1
                     if ( data[idx] == u'n' ):
                         s = s + u'\n'
                     elif ( data[idx] == u't' ):
                         s = s +u'\t'
                     else:
                         s = s + data[idx]
                 else:
                     s = s + data[idx]
                 idx  = idx + 1
             if ( idx < len(data) ): # guard: unterminated string literal at EOF
                 s = s + data[idx] # append the closing quote
                 idx = idx + 1
             self.get_lexeme( s , tok_start_idx )
         elif ( c in self.unary_binary_ops ):
             tok_start_idx = idx                 
             if ( len(data) > ( 1 + idx  ) 
                  and data[idx+1] in [u'=',u'|',u'&',u'>',u'<'] ):
                 c = c +data[idx+1]
                 idx = idx + 1
             self.get_lexeme(  c , tok_start_idx )
             idx = idx + 1
         elif c == u";":
             # treat as newline
             idx = idx + 1
             continue
         else:
             tok_start_idx = idx 
             idx = idx + 1
             self.get_lexeme( c , tok_start_idx )
     
     tok_start_idx = idx
     ## close the file if not stdin_mode
     if ( not self.stdin_mode ): self.File.close()
     
     ## and manually add an EOF token.
     eof_tok = EzhilLexeme("",EzhilToken.EOF )
     eof_tok.set_line_col( self.get_line_col( tok_start_idx ) )
     self.tokens.append( eof_tok )
     if ( self.debug ):  print(u"before reverse"); self.dump_tokens()
     self.tokens.reverse()
     if ( self.debug ):  print(u"after reverse"); self.dump_tokens()
     return
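One subtle branch in the loop above is numeric scanning: a '+' or '-' ends the literal unless an exponent marker e/E has already been consumed, which is what the in_sci_notation flag tracks. A standalone sketch of just that rule (scan_number is a hypothetical helper, not part of the Ezhil lexer):

 def scan_number(data, idx):
     # consume a numeric literal starting at data[idx]; a sign is accepted
     # only once an 'e'/'E' exponent marker has been seen
     num = data[idx]
     idx += 1
     in_sci_notation = False
     while idx < len(data) and (data[idx].isdigit() or data[idx] in '+-eE.'):
         if data[idx] in '+-' and not in_sci_notation:
             break              # a bare sign starts a new token, e.g. '5-3'
         elif data[idx] in 'eE':
             in_sci_notation = True
         num += data[idx]
         idx += 1
     return num, idx

 print(scan_number("3.14e-2 + 1", 0))  # ('3.14e-2', 7)
 print(scan_number("5-3", 0))          # ('5', 1): '-' is left for the operator branch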
Example #4: the same tokenize method after reformatting; one extra debug print appears in the File.data branch.
    def tokenize(self, data=None):
        """ do hard-work of tokenizing and
        put EzhilLexemes into the tokens[] Q """
        if (self.debug): print(u"Start of Ezhil lexer - begin tokenize")
        if (self.stdin_mode):
            if (self.debug): print(self.tokens)
            ## in stdin_mode, drain any EOF token left over from the previous session
            if (len(self.tokens) != 0):
                self.match(EzhilToken.EOF)
            if (len(self.tokens) != 0):
                raise ScannerException(
                    "Lexer: token Q has previous session tokens")
            self.tokens = list()
        else:
            if hasattr(self.File, 'data'):
                if (self.debug): print("data attribute")
                data = self.File.data
            elif self.encoding == "utf-8":
                data = self.File.read()
            elif self.encoding == "tscii":
                if self.debug: print("Loading TSCII converted data -> ")
                data = self.converted_data
            else:
                assert False, "unknown encoding: %s" % str(self.encoding)
        if (self.debug): print(data)
        idx = 0
        tok_start_idx = 0

        while (idx < len(data)):
            c = data[idx]
            if (self.debug): print(idx, c)
            if (istamil(c) or c.isalpha() or c == u'_'):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while ((idx < len(data))
                       and self.is_allowed_for_identifier(data[idx])):
                    s = s + data[idx]
                    idx = idx + 1
                if idx < len(data) and not data[idx].isspace():
                    if data[idx] in ['#', '$', '@', '\'', '"']:
                        raise ScannerException(
                            "Lexer: token %s is not valid for identifier, with prefix %s"
                            % (data[idx], s))
                self.get_lexeme(s, tok_start_idx)
            elif (c.isspace()):  # space, tab or newline
                if (c == u'\n'):
                    ##actual col = idx - col_idx
                    self.update_line_col(idx)
                idx = idx + 1
            elif (c == u'\r'):
                idx = idx + 1
                continue
            elif (c == u'#'):
                ## skip single-line comments, as in Python/Octave
                start = idx
                while (idx < len(data) and not (data[idx] in [u'\r', u'\n'])):
                    idx = idx + 1
                if (idx < len(data) and data[idx] == u'\r'):
                    idx = idx + 1
                end = idx
                self.comments[self.line] = data[start:end]
            elif (c.isdigit()):  #or c == '+' or c == '-'  ):
                num = c
                tok_start_idx = idx
                idx = idx + 1
                ## FIXME: leading +.xyz or -.xyz forms are not supported;
                ## write 0.xyz instead. This may also throw an error if we
                ## read past the end of the buffer.
                if (c in [u'+', u'-'] and (idx < len(data))
                        and not data[idx].isdigit()):
                    self.get_lexeme(c, idx)
                    continue
                in_sci_notation = False
                while ((idx < len(data))
                       and (data[idx].isdigit()
                            or data[idx] in [u'+', u'-', u'e', u'E', u'.'])):
                    if (data[idx] in [u'+', u'-'] and not in_sci_notation):
                        break
                    elif (data[idx] in [u'e', u'E']):
                        in_sci_notation = True
                    num = num + data[idx]
                    idx = idx + 1
                self.get_lexeme(num, tok_start_idx)
            elif (c == u"\""):
                tok_start_idx = idx
                s = c
                idx = idx + 1
                while (idx < len(data) and (data[idx] != u'\"')):
                    if (data[idx] == u'\\'):
                        idx = idx + 1
                        if (data[idx] == u'n'):
                            s = s + u'\n'
                        elif (data[idx] == u't'):
                            s = s + u'\t'
                        else:
                            s = s + data[idx]
                    else:
                        s = s + data[idx]
                    idx = idx + 1
                if idx < len(data):  # guard: unterminated string literal at EOF
                    s = s + data[idx]  # append the closing quote
                    idx = idx + 1
                self.get_lexeme(s, tok_start_idx)
            elif (c in self.unary_binary_ops):
                tok_start_idx = idx
                if (len(data) > (1 + idx)
                        and data[idx + 1] in [u'=', u'|', u'&', u'>', u'<']):
                    c = c + data[idx + 1]
                    idx = idx + 1
                self.get_lexeme(c, tok_start_idx)
                idx = idx + 1
            elif c == u";":
                # treat as newline
                idx = idx + 1
                continue
            else:
                tok_start_idx = idx
                idx = idx + 1
                self.get_lexeme(c, tok_start_idx)

        tok_start_idx = idx
        ## close the file if not stdin_mode
        if (not self.stdin_mode): self.File.close()

        ## and manually add an EOF token.
        eof_tok = EzhilLexeme("", EzhilToken.EOF)
        eof_tok.set_line_col(self.get_line_col(tok_start_idx))
        self.tokens.append(eof_tok)
        if (self.debug):
            print(u"before reverse")
            self.dump_tokens()
        self.tokens.reverse()
        if (self.debug):
            print(u"after reverse")
            self.dump_tokens()
        return
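Stepping back, both versions implement the same index-driven dispatch: inspect the current character, pick a branch (identifier, whitespace, comment, number, string, operator), accumulate the lexeme, and finish by appending an explicit EOF token. A stripped-down standalone sketch of the pattern (toy_tokenize and its (kind, text) tuples are hypothetical, not the Ezhil API):

 def toy_tokenize(data):
     tokens, idx = [], 0
     while idx < len(data):
         c = data[idx]
         if c.isalpha() or c == '_':       # identifier: letters, digits, '_'
             start = idx
             while idx < len(data) and (data[idx].isalnum() or data[idx] == '_'):
                 idx += 1
             tokens.append(('ID', data[start:idx]))
         elif c.isdigit():                 # number: a plain digit run
             start = idx
             while idx < len(data) and data[idx].isdigit():
                 idx += 1
             tokens.append(('NUM', data[start:idx]))
         elif c == '#':                    # comment: skip to end of line
             while idx < len(data) and data[idx] != '\n':
                 idx += 1
         elif c.isspace():                 # whitespace separates tokens
             idx += 1
         else:                             # anything else: a one-char operator
             tokens.append(('OP', c))
             idx += 1
     tokens.append(('EOF', ''))            # explicit end-of-input marker, as above
     return tokens

 print(toy_tokenize("x1 = 42 + y # demo"))
 # [('ID', 'x1'), ('OP', '='), ('NUM', '42'), ('OP', '+'), ('ID', 'y'), ('EOF', '')]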
Example #5: the next_tamil_letter fix shown as a before/after pair; first the raise StopIteration version.
 def next_tamil_letter(self):
     self.handle = codecs.open(self.filename,'r','utf-8')
     for letter in utf8.get_letters(self.handle.read()):
         if ( utf8.istamil( letter ) ):
             yield letter
     raise StopIteration # before: escapes as RuntimeError under PEP 479 (Python 3.7+)
The corrected version ends with a bare return and iterates with utf8.get_letters_iterable:
 def next_tamil_letter(self):
     self.handle = codecs.open(self.filename,'r','utf-8')
     for letter in utf8.get_letters_iterable(self.handle.read()):
         if ( utf8.istamil( letter ) ):
             yield letter
     return # after: the generator ends cleanly on every Python version
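As a closing note, every next_tamil_letter variant above opens the file with codecs.open and never closes it. A minimal Python 3 sketch of the same pattern using a with-block (tamil_letters is a hypothetical rename; tamil.utf8 is the open-tamil module the examples already use):

 from tamil import utf8  # open-tamil package

 def tamil_letters(filename):
     # the with-block closes the handle when the generator finishes
     # or is explicitly closed
     with open(filename, 'r', encoding='utf-8') as handle:
         for letter in utf8.get_letters(handle.read()):
             if utf8.istamil(letter):
                 yield letter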