def extract_sgml_tokens(self, data):
    """
    Internal: Extract tokens from inside SGML tag.

    data - SGML tag String.

    Examples

      extract_sgml_tokens("<a href='' class=foo>")
      # => ["<a>", "href="]

      (Example carried over from the original documentation; the exact
      output depends on the REGEX_* patterns, which are defined outside
      this method.)

    Returns Array of token Strings.
    """
    s = StringScanner(data)
    tokens = []
    append = tokens.append

    # NOTE(review): `is_eos`, `getch` and `terminate` are accessed without
    # being called. Presumably they are properties on StringScanner whose
    # access performs the action -- confirm against the scanner library.
    while not s.is_eos:
        # Emit start token, re-closed with '>' so it reads as a full tag.
        token = s.scan(REGEX_EMIT_START_TOKEN)
        if token:
            append(token + '>')
            continue

        # Emit attributes with trailing =
        token = s.scan(REGEX_EMIT_TRAILING)
        if token:
            append(token)

            # Then skip over attribute value: double-quoted, single-quoted,
            # or (fallback) up to and including the next REGEX_EMIT_WORD
            # match. The value itself is never emitted as a token.
            if s.scan(REGEX_DOUBLE_QUOTE):
                s.skip_until(REGEX_DOUBLE_END_QUOTE)
                continue
            if s.scan(REGEX_SINGLE_QUOTE):
                s.skip_until(REGEX_SINGLE_END_QUOTE)
                continue
            s.skip_until(REGEX_EMIT_WORD)
            continue

        # Emit lone attributes
        token = s.scan(REGEX_EMIT_WORD)
        if token:
            append(token)
            # No `continue` here: control falls through to the end-of-tag
            # check below and, failing that, one more character is
            # consumed by `s.getch`.

        # Stop at the end of the tag
        if s.scan(REGEX_EMIT_END_TAG):
            s.terminate
            continue

        # Nothing matched at this position: advance by one character so
        # the loop always makes progress.
        s.getch

    return tokens
def extract_tokens(self, data):
    """
    Internal: Extract generic tokens from data.

    Comments, quoted strings and number literals are skipped rather than
    emitted. Scanning stops once the scanner position reaches BYTE_LIMIT.

    data - String to scan.

    Examples

      extract_tokens("printf('Hello')")
      # => ['printf', '(', ')']

      (Example carried over from the original documentation; the exact
      output depends on the REGEX_* patterns, which are defined outside
      this method.)

    Returns Array of token Strings.
    """
    s = StringScanner(data)
    tokens = []

    # NOTE(review): `is_eos`, `is_bol` and `getch` are accessed without
    # being called. Presumably they are properties on StringScanner whose
    # access performs the action -- confirm against the scanner library.
    while not s.is_eos:
        # Only the leading part of the input is tokenized.
        if s.pos >= BYTE_LIMIT:
            break

        # Shebang line: emitted as a single 'SHEBANG#!<name>' token when
        # extract_shebang yields a name, otherwise silently dropped.
        token = s.scan(REGEX_SHEBANG)
        if token:
            name = self.extract_shebang(token)
            if name:
                tokens.append('SHEBANG#!%s' % name)
            continue

        # Single line comment (only recognised at the beginning of a line)
        if s.is_bol and s.scan(START_SINGLE_LINE_COMMENT):
            s.skip_until(REGEX_BOL)
            continue

        # Multiline comments: the opening token selects its closing
        # pattern from MULTI_LINE_COMMENT_DICT.
        token = s.scan(START_MULTI_LINE_COMMENT)
        if token:
            close_token = MULTI_LINE_COMMENT_DICT[token]
            s.skip_until(close_token)
            continue

        # Skip single or double quoted strings. If the character right
        # after the opening quote is the same quote, only that one
        # character is consumed (empty string literal).
        if s.scan(REGEX_DOUBLE_QUOTE):
            if s.peek(1) == '"':
                s.getch
            else:
                s.skip_until(REGEX_DOUBLE_END_QUOTE)
            continue
        if s.scan(REGEX_SINGLE_QUOTE):
            if s.peek(1) == "'":
                s.getch
            else:
                s.skip_until(REGEX_SINGLE_END_QUOTE)
            continue

        # Skip number literals
        if s.scan(REGEX_NUMBER_LITERALS):
            continue

        # SGML style brackets: delegate to the tag-level tokenizer and
        # splice its tokens into the result.
        token = s.scan(REGEX_SGML)
        if token:
            tokens.extend(self.extract_sgml_tokens(token))
            continue

        # Common programming punctuation
        token = s.scan(REGEX_COMMON_PUNCTUATION)
        if token:
            tokens.append(token)
            continue

        # Regular token
        token = s.scan(REGEX_REGULAR_TOKEN)
        if token:
            tokens.append(token)
            continue

        # Common operators
        token = s.scan(REGEX_COMMON_OPERATORS)
        if token:
            tokens.append(token)
            continue

        # Nothing matched at this position: advance by one character so
        # the loop always makes progress.
        s.getch

    return tokens