def extract_shebang(cls, data):
    """
    Internal: Extract normalized shebang command token.

    data - String starting with a shebang line.

    Examples

      extract_shebang("#!/usr/bin/ruby")
      # => "ruby"
      extract_shebang("#!/usr/bin/env node")
      # => "node"

    Returns String token or None if it couldn't be parsed.
    """
    import re

    s = StringScanner(data)
    path = s.scan(REGEX_SHEBANG_FULL)
    if path:
        # The interpreter is the basename of the shebang path.
        script = path.split('/')[-1]
        if script == 'env':
            # "#!/usr/bin/env node" — the real interpreter is the
            # next whitespace-delimited token after "env".
            s.scan(REGEX_SHEBANG_WHITESPACE)
            script = s.scan(REGEX_SHEBANG_NON_WHITESPACE)
        if script:
            # Strip a trailing version suffix ("python2.7" -> "python").
            # Fix: the original called bare compile() — the builtin (or a
            # sibling compile(query) shadow) — instead of re.compile(),
            # which raised at runtime. Also guard against a no-match
            # (script starting with a digit) instead of AttributeError-ing.
            match = re.compile(r'[^\d]+').match(script)
            script = match.group(0) if match else None
        return script
    return None
def compile(query):
    """
    Internal: Compile a raw search query into tag buckets.

    query - Query String, e.g. "language:python foo".

    Returns Dict with "tags" and "not_tags" list entries, plus a
    "general" entry when the query contains a plain (non-tag) token.
    """
    compiled = {"tags": [], "not_tags": []}
    # Strip single and double quotes before scanning.
    # Fix: the original ran re.sub() with a str pattern over
    # query.encode('utf8') (bytes), which raises TypeError on Python 3;
    # operate on the str directly.
    sanitized_query = re.sub(r"['\"]", "", query)
    scanner = StringScanner(sanitized_query)
    while not scanner.is_eos:
        token = scanner.scan(_next_token())
        if not token:
            # No token at the cursor: skip separator characters and retry.
            scanner.skip(_separators())
            continue
        if ":" in token:
            # "key:value" tokens are compiled into tags/not_tags.
            compiled = _compile_tag(compiled, token)
        else:
            # NOTE: the original guarded this with a `first_token` flag
            # that was never cleared (`if not first_token: first_token =
            # True` could never fire), so every plain token lands here
            # unconditionally; the dead flag is removed.
            compiled["general"] = token
    return compiled
def compile(query):
    """
    Internal: Compile a raw search query into a SearchQuery.

    query - Query String, e.g. "language:python foo bar".

    Returns a SearchQuery built from "tags", "not_tags" and the
    space-joined plain (general) tokens.
    """
    compiled = {"tags": [], "not_tags": [], "general": []}
    # Strip double quotes before scanning.
    # Fix: the original called query.encode('utf8').replace("\"", ""),
    # i.e. bytes.replace with str arguments, which raises TypeError on
    # Python 3; strip quotes on the str directly.
    scanner = StringScanner(query.replace("\"", ""))
    while not scanner.is_eos:
        token = scanner.scan(_next_token())
        if not token:
            # No token at the cursor: skip separator characters and retry.
            scanner.skip(_separators())
            continue
        if ":" in token:
            # "key:value" tokens are compiled into tags/not_tags.
            compiled = _compile_tag(compiled, token)
        else:
            # NOTE: the original guarded this with a `first_token` flag
            # that was never cleared, so every plain token is collected;
            # the dead flag is removed.
            compiled["general"].append(token)
    compiled["general"] = ' '.join(compiled["general"])
    return SearchQuery(compiled)
def setUp(self):
    """Create a fresh scanner over a known sample string for each test."""
    sample = 'hello'
    self.string = sample
    self.strscan = StringScanner(sample)
def extract_tokens(self, data):
    """
    Internal: Extract generic tokens from data.

    data - String to scan.

    Examples

      extract_tokens("printf('Hello')")
      # => ['printf', '(', ')']

    Returns Array of token Strings.
    """
    s = StringScanner(data)
    tokens = []
    # Each loop iteration tries the scan rules below in order; the first
    # rule that matches consumes input and `continue`s. If nothing
    # matches, one character is dropped at the bottom of the loop.
    while not s.is_eos:
        # Stop tokenizing past the byte limit; the prefix is enough.
        if s.pos >= BYTE_LIMIT:
            break
        token = s.scan(REGEX_SHEBANG)
        if token:
            # Normalize a shebang line into a "SHEBANG#!<interpreter>" token.
            name = self.extract_shebang(token)
            if name:
                tokens.append('SHEBANG#!%s' % name)
            continue
        # Single line comment — only recognized at the beginning of a
        # line; skip to the next line start.
        if s.is_bol and s.scan(START_SINGLE_LINE_COMMENT):
            s.skip_until(REGEX_BOL)
            continue
        # Multiline comments — the opening token selects its matching
        # closer via MULTI_LINE_COMMENT_DICT; skip everything up to it.
        token = s.scan(START_MULTI_LINE_COMMENT)
        if token:
            close_token = MULTI_LINE_COMMENT_DICT[token]
            s.skip_until(close_token)
            continue
        # Skip single or double quoted strings (their contents are not
        # meaningful tokens). An immediately repeated quote means an
        # empty string: consume the closing quote and move on.
        # NOTE(review): `s.getch` is accessed without call parentheses —
        # presumably a property on this StringScanner port (like
        # `is_eos`/`is_bol`); confirm against the scanner library.
        if s.scan(REGEX_DOUBLE_QUOTE):
            if s.peek(1) == '"':
                s.getch
            else:
                s.skip_until(REGEX_DOUBLE_END_QUOTE)
            continue
        if s.scan(REGEX_SINGLE_QUOTE):
            if s.peek(1) == "'":
                s.getch
            else:
                s.skip_until(REGEX_SINGLE_END_QUOTE)
            continue
        # Skip number literals — concrete values carry no signal.
        if s.scan(REGEX_NUMBER_LITERALS):
            continue
        # SGML style brackets — delegate the tag interior to
        # extract_sgml_tokens and splice its tokens in.
        token = s.scan(REGEX_SGML)
        if token:
            for t in self.extract_sgml_tokens(token):
                tokens.append(t)
            continue
        # Common programming punctuation
        token = s.scan(REGEX_COMMON_PUNCTUATION)
        if token:
            tokens.append(token)
            continue
        # Regular token
        token = s.scan(REGEX_REGULAR_TOKEN)
        if token:
            tokens.append(token)
            continue
        # Common operators
        token = s.scan(REGEX_COMMON_OPERATORS)
        if token:
            tokens.append(token)
            continue
        # Nothing matched: drop one character and retry.
        s.getch
    return tokens
def extract_sgml_tokens(self, data):
    """
    Internal: Extract tokens from inside SGML tag.

    data - SGML tag String.

    Examples

      extract_sgml_tokens("<a href='' class=foo>")
      # => ["<a>", "href="]

    Returns Array of token Strings.
    """
    scanner = StringScanner(data)
    result = []
    # Try each rule in order per iteration; an unmatched character is
    # dropped at the bottom of the loop.
    while not scanner.is_eos:
        # Emit start token, normalized to a closed form ("<a" -> "<a>").
        chunk = scanner.scan(REGEX_EMIT_START_TOKEN)
        if chunk:
            result.append(chunk + '>')
            continue
        # Emit attributes with trailing "=", then skip their value —
        # quoted values run to the closing quote, bare values to the
        # next word boundary.
        chunk = scanner.scan(REGEX_EMIT_TRAILING)
        if chunk:
            result.append(chunk)
            if scanner.scan(REGEX_DOUBLE_QUOTE):
                scanner.skip_until(REGEX_DOUBLE_END_QUOTE)
                continue
            if scanner.scan(REGEX_SINGLE_QUOTE):
                scanner.skip_until(REGEX_SINGLE_END_QUOTE)
                continue
            scanner.skip_until(REGEX_EMIT_WORD)
            continue
        # Emit lone attributes (no "=value" part).
        chunk = scanner.scan(REGEX_EMIT_WORD)
        if chunk:
            result.append(chunk)
        # Stop at the end of the tag.
        if scanner.scan(REGEX_EMIT_END_TAG):
            scanner.terminate
            continue
        scanner.getch
    return result