def extract_shebang(cls, data):
    """
    Internal: Extract normalized shebang command token.

    data - String beginning with "#!".

    Examples

      extract_shebang("#!/usr/bin/ruby")
      # => "ruby"
      extract_shebang("#!/usr/bin/env node")
      # => "node"

    Returns String token or None if it couldn't be parsed.
    """
    s = StringScanner(data)
    path = s.scan(r'^#!\s*\S+')
    if path:
        script = path.split('/')[-1]
        if script == 'env':
            # "#!/usr/bin/env interpreter": the real command is the
            # next whitespace-separated token.
            s.scan(r'\s+')
            script = s.scan(r'\S+')
        if script:
            # Strip a trailing version number, e.g. "python2.7" -> "python".
            # BUG FIX: the previous code called .group(0) directly and
            # crashed with AttributeError when the name started with a
            # digit (match() returns None); now it degrades to None.
            match = re.match(r'[^\d]+', script)
            script = match.group(0) if match else None
        return script
    return None
class StringParser(object):
    """Iterator that tokenizes a string against a lexicon.

    lexicon is a sequence of (pattern, token_class) pairs. Each step
    returns token_class(matched_text, start=..., end=...) for the first
    pattern matching at the current scan position; an Exception is
    raised if no pattern advances the scanner.
    """

    def __init__(self, lexicon, str_):
        self.lexicon = lexicon
        self.scanner = StringScanner(str_)

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        scanner = self.scanner
        while not scanner.eos():
            start_pos = scanner.pos
            for pattern, token_cls in self.lexicon:
                if not scanner.check(pattern):
                    continue
                text = scanner.scan(pattern)
                return token_cls(
                    text,
                    start=scanner.prev_pos,
                    end=scanner.pos,
                )
            if scanner.pos == start_pos:
                # No pattern consumed anything — the input is untokenizable.
                raise Exception("Cannot tokenize:\n{}".format(scanner.rest()))
        raise StopIteration()

    def tokenize(self):
        """Exhaust the scanner and return all tokens as a list."""
        return list(self)
def test_skip(self):
    """skip() returns the match length, or None when the pattern misses."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    s = StringScanner('test string')
    self.assertEqual(s.skip(r'\w+'), 4)
    self.assertEqual(s.skip(r'\w+'), None)
    self.assertEqual(s.skip(r'\s+'), 1)
    self.assertEqual(s.skip(r'\w+'), 6)
    self.assertEqual(s.skip(r'.'), None)
def test_concat(self):
    """concat() appends to the scanned string; non-str args raise TypeError."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    source = "Fri Dec 12 1975 14:39"
    s = StringScanner(source)
    s.scan(r'Fri ')
    self.assertRaises(TypeError, s.concat, 1)
    s.concat(' +1000 GMT')
    self.assertEqual(s.string, 'Fri Dec 12 1975 14:39 +1000 GMT')
def test_construction(self):
    """StringScanner accepts only str sources and starts at position 0."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertRaises(TypeError, StringScanner, 1)
    self.assertEqual(StringScanner('dont care').string, 'dont care')
    self.assertEqual(StringScanner().pos, 0)
    self.assertEqual(StringScanner().string, None)
    scanner = StringScanner()
    # .string is None here (asserted above), so "calling" it with an
    # argument raises TypeError — None is not callable.
    self.assertRaises(TypeError, scanner.string, 1)
    scanner.string = 'dont care'
    self.assertEqual(scanner.string, 'dont care')
    self.assertEqual(scanner.pos, 0)
def __init__(self, source):
    """Initialize scanning state over a str source.

    source - str to scan.

    Raises TypeError if source is not a str.
    """
    # isinstance is the idiomatic type check; it also accepts str
    # subclasses, unlike the previous exact-type comparison.
    if not isinstance(source, str):
        raise TypeError('Type %s is not supported' % type(source))
    self.__source = source
    self.__formats = None
    self.__scanner = StringScanner(source)
    self.__comments = []  # spans collected by scanning; initially empty
    self.__ignores = []   # spans to ignore; initially empty
def test_pre_match_post_match_property(self):
    """pre_match/post_match expose the text before/after the last match."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    s = StringScanner('test string')
    self.assertEqual(s.scan(r'\w+'), 'test')
    self.assertEqual(s.scan(r'\s+'), ' ')
    self.assertEqual(s.pre_match, 'test')
    self.assertEqual(s.post_match, 'string')
def test_scan_until(self):
    """scan_until() consumes through the first match; None when absent."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    s = StringScanner("Fri Dec 12 1975 14:39")
    self.assertEqual(s.scan_until(r'1'), "Fri Dec 1")
    self.assertEqual(s.pre_match, 'Fri Dec ')
    self.assertIsNone(s.scan_until(r'XYZ'))
def test_scan(self):
    """scan() consumes matching runs while eos()/pos track the cursor."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    source = 'This is an example string'
    s = StringScanner(source)
    self.assertFalse(s.eos())
    s.pos = len(source)
    self.assertTrue(s.eos())
    s.pos = 0
    self.assertEqual(s.scan(r'\w+'), 'This')
    self.assertIsNone(s.scan(r'\w+'))
    self.assertEqual(s.scan(r'\s+'), ' ')
    self.assertIsNone(s.scan(r'\s+'))
    self.assertEqual(s.scan(r'\w+'), 'is')
    self.assertFalse(s.eos())
    self.assertEqual(s.pos, 7)
    self.assertEqual(s.scan(r'\s+'), ' ')
    self.assertEqual(s.scan(r'\w+'), 'an')
    self.assertEqual(s.scan(r'\s+'), ' ')
    self.assertEqual(s.scan(r'\w+'), 'example')
    self.assertEqual(s.scan(r'\s+'), ' ')
    self.assertEqual(s.scan(r'\w+'), 'string')
    self.assertTrue(s.eos())
    self.assertIsNone(s.scan(r'\s+'))
    self.assertIsNone(s.scan(r'\w+'))
def test_skip_until(self):
    """skip_until() returns how many characters were skipped through the match."""
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    s = StringScanner("Fri Dec 12 1975 14:39")
    self.assertEqual(s.skip_until(r'12'), 10)
def extract_tokens(self, data):
    """
    Internal: Extract generic tokens from data.

    data - String to scan.

    Examples

      extract_tokens("printf('Hello')")
      # => ['printf', '(', ')']

    Returns Array of token Strings.
    """
    s = StringScanner(data)
    tokens = []
    # NOTE(review): is_eos and getch are accessed without calling —
    # assumes this StringScanner port exposes them as properties; verify.
    while not s.is_eos:
        # Stop once the sampling byte limit is reached.
        if s.pos >= BYTE_LIMIT:
            break

        # Emit a normalized SHEBANG token for "#!..." lines.
        token = s.scan(r'^#!.+')
        if token:
            name = self.extract_shebang(token)
            if name:
                tokens.append('SHEBANG#!%s' % name)
            continue

        # Single line comment
        if s.is_beginning_of_line and s.scan(START_SINGLE_LINE_COMMENT):
            s.skip_until(r'\n|\Z')
            continue

        # Multiline comments
        token = s.scan(START_MULTI_LINE_COMMENT)
        if token:
            # Look up the matching close delimiter for this open token.
            close_token = dict(MULTI_LINE_COMMENTS).get(token)
            s.skip_until(re.compile(re.escape(close_token)))
            continue

        # Skip single or double quoted strings
        # (no continue here: deliberately falls through to the checks below)
        if s.scan(r'"'):
            if s.peek(1) == '"':
                # Empty string literal: consume the closing quote.
                s.getch
            else:
                # Skip to the first unescaped closing quote.
                s.skip_until(r'[^\\]"')
        if s.scan(r"'"):
            if s.peek(1) == "'":
                s.getch
            else:
                s.skip_until(r"[^\\]'")

        # Skip number literals
        if s.scan(r'(0x)?\d(\d|\.)*'):
            continue

        # SGML style brackets
        token = s.scan(r'<[^\s<>][^<>]*>')
        if token:
            for t in self.extract_sgml_tokens(token):
                tokens.append(t)
            continue

        # Common programming punctuation
        token = s.scan(r';|\{|\}|\(|\)|\[|\]')
        if token:
            tokens.append(token)
            continue

        # Regular token
        token = s.scan(r'[\w\.@#\/\*]+')
        if token:
            tokens.append(token)
            continue

        # Common operators
        token = s.scan(r'<<?|\+|\-|\*|\/|%|&&?|\|\|?')
        if token:
            tokens.append(token)
            continue

        # Nothing matched: advance one character so the loop terminates.
        s.getch
    return tokens
def extract_sgml_tokens(self, data):
    """
    Internal: Extract tokens from inside SGML tag.

    data - SGML tag String.

    Examples

      extract_sgml_tokens("<a href='' class=foo>")
      # => ["<a>", "href="]

    Returns Array of token Strings.
    """
    s = StringScanner(data)
    tokens = []
    # NOTE(review): is_eos, terminate and getch are accessed without
    # calling — assumes this StringScanner port exposes them as
    # properties; verify.
    while not s.is_eos:
        # Emit start token
        token = s.scan(r'<\/?[^\s>]+')
        if token:
            # Normalize "<a href..." to "<a>".
            tokens.append(token + '>')
            continue

        # Emit attributes with trailing =
        token = s.scan(r'\w+=')
        if token:
            tokens.append(token)

            # Then skip over attribute value
            if s.scan('"'):
                s.skip_until(r'[^\\]"')
                continue
            if s.scan("'"):
                s.skip_until(r"[^\\]'")
                continue
            s.skip_until(r'\w+')
            continue

        # Emit lone attributes
        # (no continue: deliberately falls through to the '>' check)
        token = s.scan(r'\w+')
        if token:
            tokens.append(token)

        # Stop at the end of the tag
        if s.scan('>'):
            s.terminate
            continue

        # Nothing matched: advance one character so the loop terminates.
        s.getch
    return tokens
def __init__(self, lexicon, str_):
    """Bind the lexicon and wrap the input string in a StringScanner."""
    self.scanner = StringScanner(str_)
    self.lexicon = lexicon
class AbstractExtractor(object):
    """Abstract base for comment extractors over a source string.

    Subclasses implement get_definitions(), returning dicts with at
    least 'type', 'block', 'startwith' and 'endwith' keys; scan() then
    collects (start, end) position spans for each definition.
    """

    # Python 2 style ABC declaration (kept for compatibility with the
    # rest of this file, which uses Python 2 print statements).
    __metaclass__ = ABCMeta

    __definitions = []

    def __init__(self, source):
        """source - str to scan. Raises TypeError for non-str input."""
        if type(source) is not type(''):
            raise TypeError('Type %s is not supported' % type(source))
        self.__source = source
        self.__formats = None
        self.__scanner = StringScanner(source)
        self.__comments = []  # (start, end) spans appended by scan()
        self.__ignores = []

    @abstractmethod
    def get_definitions(cls):
        # NOTE(review): the parameter is named cls but this is not a
        # @classmethod (the decorator is commented out in the original);
        # it is invoked as self.get_definitions() below — confirm intent.
        return cls.__definitions

    def get_comment_definitions(self):
        """Return only the definitions whose 'type' is 'comment'."""
        return filter(lambda x: x['type'] == 'comment', self.get_definitions())

    @property
    def comments(self):
        # Deduplicate and order spans by their start position.
        return sorted(set(self.__comments), key=lambda comment: comment[0])

    @property
    def ignores(self):
        return sorted(set(self.__ignores), key=lambda comment: comment[0])

    def extract(self):
        pass

    def scan_block_comments(self):
        """Collect spans for every block (multi-line) comment definition."""
        for block_comment in [definition for definition in self.get_definitions()
                              if definition['block']]:
            self.__comments += self.scan(block_comment['startwith'],
                                         block_comment['endwith'],
                                         block_comment['block'])

    def scan_line_comments(self):
        """Collect spans for every line comment definition (no end delimiter)."""
        for line_comment in [definition for definition in self.get_definitions()
                             if definition['block'] is False]:
            self.__comments += self.scan(line_comment['startwith'],
                                         None,
                                         line_comment['block'])

    def scan_comments(self):
        """Scan both block and line comments.

        BUG FIX: these were previously called as bare names
        (scan_block_comments() / scan_line_comments()), which raised
        NameError at runtime — they are instance methods and need self.
        """
        self.scan_block_comments()
        self.scan_line_comments()

    def scan_ignore(self):
        pass

    def scan(self, startwith, endwith, block):
        """Scan for delimited spans; return a list of (start, end) tuples.

        startwith - opening delimiter string.
        endwith   - closing delimiter string (block comments only).
        block     - True for block comments, False for line comments.
        """
        result = list()
        while not self.__scanner.eos():
            if self.__scanner.skip_until(re.escape(startwith)) is None:
                # No more opening delimiters: consume the rest and stop.
                self.__scanner.skip('.*$', re.S)
                continue
            spos = self.__scanner.pos - len(startwith)
            if block is True:  # block comment
                if self.__scanner.skip_until(re.escape(endwith)) is None:
                    raise SyntacticError
                    # NOTE(review): unreachable after raise — looks like
                    # leftover recovery code; left as-is pending review.
                    self.__scanner.skip('.*$', re.S)
                    continue
            else:  # line comment
                self.__scanner.skip_until(r'.*$', re.M)
            epos = self.__scanner.pos
            result.append((spos, epos))
            # Debug output (parenthesized form works on Python 2 and 3).
            print('>>>> %d:%d' % (spos, epos))
            print(self.__scanner.string[spos:epos])
            print('<<<<')
        return result
def scanner():
    """Return a fresh StringScanner over the fixture text "bar foobar"."""
    text = "bar foobar"
    return StringScanner(text)