def next_tamil_letter(self):
    self.handle = codecs.open(self.filename, 'r', 'utf-8')
    for letter in utf8.get_letters_iterable(self.handle.read()):
        if utf8.istamil(letter):
            yield letter
    return
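# Usage sketch (an assumption, not documented API): the host class is not
# shown in this fragment, so the instance name and sample file below are
# hypothetical. Requires `import codecs` and `from tamil import utf8`
# (the open-tamil package) at module level.
#
#     reader = SomeEzhilSource('sample.n')       # hypothetical class / file
#     for letter in reader.next_tamil_letter():
#         print(letter)  # one logical Tamil letter (base + combinants) per step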
def tokenize(self, data=None):
    """ do the hard work of tokenizing and put EzhilLexemes into the tokens[] Q """
    if self.debug:
        print(u"Start of Ezhil lexer - begin tokenize")
    if self.stdin_mode:
        if self.debug:
            print(self.tokens)
        ## cleanup the Q for stdin_mode of any EOF that can remain.
        if len(self.tokens) != 0:
            self.match(EzhilToken.EOF)
        if len(self.tokens) != 0:
            raise ScannerException("Lexer: token Q has previous session tokens")
        self.tokens = list()
    else:
        if hasattr(self.File, 'data'):
            if self.debug:
                print("data attribute")
            data = self.File.data
        elif self.encoding == "utf-8":
            data = self.File.read()
        elif self.encoding == "tscii":
            if self.debug:
                print("Loading TSCII converted data -> ")
            data = self.converted_data
        else:
            assert False
    if self.debug:
        print(data)
    idx = 0
    tok_start_idx = 0
    while idx < len(data):
        c = data[idx]
        if self.debug:
            print(idx, c)
        if istamil(c) or c.isalpha() or c == u'_':
            ## identifier or keyword
            tok_start_idx = idx
            s = c
            idx = idx + 1
            while idx < len(data) and self.is_allowed_for_identifier(data[idx]):
                s = s + data[idx]
                idx = idx + 1
            if idx < len(data) and not data[idx].isspace():
                if data[idx] in ['#', '$', '@', '\'', '"']:
                    raise ScannerException(
                        "Lexer: token %s is not valid for identifier, with prefix %s"
                        % (data[idx], s))
            self.get_lexeme(s, tok_start_idx)
        elif c.isspace():  # or c == u' ' or c == u'\t' or c == u'\n'
            if c == u'\n':
                ## actual col = idx - col_idx
                self.update_line_col(idx)
            idx = idx + 1
        elif c == u'\r':
            idx = idx + 1
            continue
        elif c == u'#':
            ## skip single-line comments, like Python/Octave
            start = idx
            while idx < len(data) and data[idx] not in [u'\r', u'\n']:
                idx = idx + 1
            if idx < len(data) and data[idx] == u'\r':
                idx = idx + 1
            end = idx
            self.comments[self.line] = data[start:end]
        elif c.isdigit():  # or c == '+' or c == '-'
            num = c
            tok_start_idx = idx
            idx = idx + 1
            ## FIXME: this prevents you from +.xyz, or -.xyz; use 0.xyz
            ## instead. also may throw an error if we exceed buffer-length.
            if c in [u'+', u'-'] and idx < len(data) and not data[idx].isdigit():
                self.get_lexeme(c, idx)
                continue
            in_sci_notation = False
            while idx < len(data) and (data[idx].isdigit()
                                       or data[idx] in [u'+', u'-', u'e', u'E', u'.']):
                if data[idx] in [u'+', u'-'] and not in_sci_notation:
                    break
                elif data[idx] in [u'e', u'E']:
                    in_sci_notation = True
                num = num + data[idx]
                idx = idx + 1
            self.get_lexeme(num, tok_start_idx)
        elif c == u"\"":
            ## string literal, honoring the \n and \t escapes
            tok_start_idx = idx
            s = c
            idx = idx + 1
            while idx < len(data) and data[idx] != u'\"':
                if data[idx] == u'\\':
                    idx = idx + 1
                    if data[idx] == u'n':
                        s = s + u'\n'
                    elif data[idx] == u't':
                        s = s + u'\t'
                    else:
                        s = s + data[idx]
                else:
                    s = s + data[idx]
                idx = idx + 1
            s = s + data[idx]  # append the closing quote
            idx = idx + 1
            self.get_lexeme(s, tok_start_idx)
        elif c in self.unary_binary_ops:
            tok_start_idx = idx
            if len(data) > (1 + idx) and data[idx + 1] in [u'=', u'|', u'&', u'>', u'<']:
                c = c + data[idx + 1]
                idx = idx + 1
            self.get_lexeme(c, tok_start_idx)
            idx = idx + 1
        elif c == u";":
            # treat as newline
            idx = idx + 1
            continue
        else:
            tok_start_idx = idx
            idx = idx + 1
            self.get_lexeme(c, tok_start_idx)
        tok_start_idx = idx
    ## close the file if not stdin_mode
    if not self.stdin_mode:
        self.File.close()
    ## and manually add an EOF statement.
    eof_tok = EzhilLexeme("", EzhilToken.EOF)
    eof_tok.set_line_col(self.get_line_col(tok_start_idx))
    self.tokens.append(eof_tok)
    if self.debug:
        print(u"before reverse")
        self.dump_tokens()
    self.tokens.reverse()
    if self.debug:
        print(u"after reverse")
        self.dump_tokens()
    return
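# Why the final reverse(): tokens[] appears to be consumed from the tail, so
# reversing lets list.pop() -- O(1) at the end of a list -- return lexemes in
# source order, EOF last. A minimal sketch of draining the queue (the
# standalone helper and its name are assumptions for illustration):
#
#     def drain(lexer):
#         out = []
#         while lexer.tokens:
#             out.append(lexer.tokens.pop())  # next token in source order
#         return out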