def Read(self, lex_mode):
  #assert self.line_pos <= len(self.line), (self.line, self.line_pos)
  tok_type, end_pos = self.match_func(lex_mode, self.line, self.line_pos)
  #assert end_pos <= len(self.line)
  if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
    return ast.token(tok_type, '', const.NO_INTEGER)

  tok_val = self.line[self.line_pos:end_pos]

  # NOTE: tok_val is redundant, but even in osh.asdl we have some separation
  # between data needed for formatting and data needed for execution.  Could
  # revisit this later.

  # TODO: Add this back once arena is threaded everywhere
  #assert self.line_id != -1
  line_span = ast.line_span(self.line_id, self.line_pos, len(tok_val))

  # NOTE: We're putting the arena hook in LineLexer and not Lexer because we
  # want it to be "low level".  The only thing fabricated here is a newline
  # added at the last line, so we don't end with \0.
  if self.arena_skip:
    assert self.last_span_id != const.NO_INTEGER
    span_id = self.last_span_id
    self.arena_skip = False
  else:
    span_id = self.arena.AddLineSpan(line_span)
    self.last_span_id = span_id
  #log('LineLexer.Read() span ID %d for %s', span_id, tok_type)

  t = ast.token(tok_type, tok_val, span_id)
  self.line_pos = end_pos
  return t

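# Hedged note on the arena_skip branch above (a sketch, not verbatim API from
# this section): when a caller "un-reads" the previous token and then calls
# Read() again, the lexer must not append a duplicate line span to the arena,
# so it reuses last_span_id instead.  The MaybeUnreadOne name below is an
# assumption about the caller, used only for illustration.
#
#   lexer.MaybeUnreadOne()     # back up one token; sets arena_skip
#   t = lexer.Read(lex_mode)   # reuses last_span_id; no new span is added
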
def testPipeline2(self):
  Banner('ls | cut -d . -f 1 | head')
  p = process.Pipeline()
  p.Add(_ExtProc(['ls']))
  p.Add(_ExtProc(['cut', '-d', '.', '-f', '1']))
  p.Add(_ExtProc(['head']))
  print(p.Run(_WAITER))

  ex = InitExecutor()

  # Simulating subshell for each command
  w1 = ast.CompoundWord()
  w1.parts.append(ast.LiteralPart(ast.token(Id.Lit_Chars, 'ls')))
  node1 = ast.SimpleCommand()
  node1.words = [w1]

  w2 = ast.CompoundWord()
  w2.parts.append(ast.LiteralPart(ast.token(Id.Lit_Chars, 'head')))
  node2 = ast.SimpleCommand()
  node2.words = [w2]

  w3 = ast.CompoundWord()
  w3.parts.append(ast.LiteralPart(ast.token(Id.Lit_Chars, 'sort')))
  w4 = ast.CompoundWord()
  w4.parts.append(ast.LiteralPart(ast.token(Id.Lit_Chars, '--reverse')))
  node3 = ast.SimpleCommand()
  node3.words = [w3, w4]

  p = process.Pipeline()
  p.Add(Process(process.SubProgramThunk(ex, node1)))
  p.Add(Process(process.SubProgramThunk(ex, node2)))
  p.Add(Process(process.SubProgramThunk(ex, node3)))
  print(p.Run(_WAITER))

def testRead(self):
  lexer = _InitLexer(CMD)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'ls'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.WS_Space, ' '), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, '/'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Op_Newline, '\n'), t)

  # Line two
  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'ls'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.WS_Space, ' '), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, '/home/'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Op_Newline, '\n'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Eof_Real, ''), t)

  # Another EOF gives EOF
  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Eof_Real, ''), t)

def testTokens(self):
  print(Id.Op_Newline)
  print(ast.token(Id.Op_Newline, '\n'))

  print(IdName(Id.Op_Newline))

  print(Kind.Eof)
  print(Kind.Left)
  print('--')
  for name in dir(Kind):
    if name[0].isupper():
      print(name, getattr(Kind, name))

  # Make sure we're not exporting too much
  print(dir(id_kind))

  # 206 out of 256 tokens now
  print(len(id_kind._ID_NAMES))

  t = ast.token(Id.Arith_Plus, '+')
  self.assertEqual(Kind.Arith, LookupKind(t.id))
  t = ast.token(Id.Arith_CaretEqual, '^=')
  self.assertEqual(Kind.Arith, LookupKind(t.id))
  t = ast.token(Id.Arith_RBrace, '}')
  self.assertEqual(Kind.Arith, LookupKind(t.id))

  t = ast.token(Id.BoolBinary_DEqual, '==')
  self.assertEqual(Kind.BoolBinary, LookupKind(t.id))

def testShellFuncExecution(self):
  ex = cmd_exec_test.InitExecutor()
  func_node = ast.FuncDef()

  c1 = ast.CompoundWord()
  t1 = ast.token(Id.Lit_Chars, 'f1')
  c1.parts.append(ast.LiteralPart(t1))

  c2 = ast.CompoundWord()
  t2 = ast.token(Id.Lit_Chars, 'f2')
  c2.parts.append(ast.LiteralPart(t2))

  a = ast.ArrayLiteralPart()
  a.words = [c1, c2]
  w = ast.CompoundWord()
  w.parts.append(a)

  # Set global COMPREPLY=(f1 f2)
  pair = ast.assign_pair(ast.LhsName('COMPREPLY'), assign_op_e.Equal, w)
  pair.spids.append(0)  # dummy
  pairs = [pair]
  body_node = ast.Assignment(Id.Assign_None, [], pairs)

  func_node.name = 'myfunc'
  func_node.body = body_node

  a = completion.ShellFuncAction(ex, func_node)
  matches = list(a.Matches([], 0, 'f'))
  self.assertEqual(['f1 ', 'f2 '], matches)

def LookAhead(self, lex_mode):
  """Look ahead for a non-space token, using the given lexer mode.

  Does NOT advance self.line_pos.

  Called with at least the following modes:
    lex_mode_e.ARITH -- for ${a[@]} vs ${a[1+2]}
    lex_mode_e.VS_1
    lex_mode_e.OUTER
  """
  pos = self.line_pos
  #print('Look ahead from pos %d, line %r' % (pos, self.line))
  while True:
    if pos == len(self.line):
      # We don't allow lookahead while already at end of line, because it
      # would involve interacting with the line reader, and we never need
      # it.  In the OUTER mode, there is an explicit newline token, but
      # ARITH doesn't have it.
      t = ast.token(Id.Unknown_Tok, '', const.NO_INTEGER)
      return t

    tok_type, end_pos = self.match_func(lex_mode, self.line, pos)
    tok_val = self.line[pos:end_pos]
    # NOTE: Instead of hard-coding this token, we could pass it in.  This
    # one only appears in OUTER state!  LookAhead(lex_mode, past_token_type)
    if tok_type != Id.WS_Space:
      break
    pos = end_pos

  return ast.token(tok_type, tok_val, const.NO_INTEGER)

def testVarOps(self):
  ev = InitEvaluator()  # initializes x=xxx and y=yyy

  unset_sub = ast.BracedVarSub(ast.token(Id.VSub_Name, 'unset'))
  part_vals = []
  ev._EvalWordPart(unset_sub, part_vals)
  print(part_vals)

  set_sub = ast.BracedVarSub(ast.token(Id.VSub_Name, 'x'))
  part_vals = []
  ev._EvalWordPart(set_sub, part_vals)
  print(part_vals)

  # Now add some ops
  part = ast.LiteralPart(ast.token(Id.Lit_Chars, 'default'))
  arg_word = ast.CompoundWord([part])
  test_op = ast.StringUnary(Id.VTest_ColonHyphen, arg_word)
  unset_sub.suffix_op = test_op
  set_sub.suffix_op = test_op

  part_vals = []
  ev._EvalWordPart(unset_sub, part_vals)
  print(part_vals)

  part_vals = []
  ev._EvalWordPart(set_sub, part_vals)
  print(part_vals)

def testDollarSqState(self):
  lexer = _InitLexer(r'foo bar\n \x00 \000 \u0065')

  t = lexer.Read(lex_mode_e.DOLLAR_SQ)
  print(t)
  self.assertTokensEqual(ast.token(Id.Char_Literals, 'foo bar'), t)

  t = lexer.Read(lex_mode_e.DOLLAR_SQ)
  print(t)
  self.assertTokensEqual(ast.token(Id.Char_OneChar, r'\n'), t)

def testToken(self):
  t = ast.token(Id.Lit_Chars, 'abc')
  print(t)

  # This redundancy is OK I guess.
  t = ast.token(Id.Lit_LBrace, '{')
  print(t)

  t = ast.token(Id.Op_Semi, ';')
  print(t)

def testLookAhead(self):
  # Lines always end with '\n'
  l = LineLexer(LEXER_DEF, '')
  self.assertTokensEqual(
      ast.token(Id.Eof_Real, ''), l.LookAhead(LexMode.OUTER))

  l = LineLexer(LEXER_DEF, 'foo')
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'foo'), l.Read(LexMode.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Eof_Real, ''), l.LookAhead(LexMode.OUTER))

  l = LineLexer(LEXER_DEF, 'foo bar')
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'foo'), l.Read(LexMode.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'bar'), l.LookAhead(LexMode.OUTER))

  # No lookahead; using the cursor!
  l = LineLexer(LEXER_DEF, 'func(')
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'func'), l.Read(LexMode.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Op_LParen, '('), l.LookAhead(LexMode.OUTER))

  l = LineLexer(LEXER_DEF, 'func (')
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'func'), l.Read(LexMode.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Op_LParen, '('), l.LookAhead(LexMode.OUTER))

def testExtGlob(self):
  lexer = _InitLexer('@(foo|bar)')

  t = lexer.Read(lex_mode_e.OUTER)
  self.assertTokensEqual(ast.token(Id.ExtGlob_At, '@('), t)

  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'foo'), t)

  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.Op_Pipe, '|'), t)

  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'bar'), t)

  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.Op_RParen, ')'), t)

  # Individual cases
  lexer = _InitLexer('@(')
  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.ExtGlob_At, '@('), t)

  lexer = _InitLexer('*(')
  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.ExtGlob_Star, '*('), t)

  lexer = _InitLexer('?(')
  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.ExtGlob_QMark, '?('), t)

  lexer = _InitLexer('$')
  t = lexer.Read(lex_mode_e.EXTGLOB)
  self.assertTokensEqual(ast.token(Id.Lit_Other, '$'), t)

def testLookAhead(self):
  # Lines always end with '\n'
  l = LineLexer(parse_lib._MakeMatcher(), '', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))

  l = LineLexer(parse_lib._MakeMatcher(), 'foo', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Unknown_Tok, ''), l.LookAhead(lex_mode_e.OUTER))

  l = LineLexer(parse_lib._MakeMatcher(), 'foo bar', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'foo'), l.Read(lex_mode_e.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'bar'), l.LookAhead(lex_mode_e.OUTER))

  # No lookahead; using the cursor!
  l = LineLexer(parse_lib._MakeMatcher(), 'func(', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Op_LParen, '('), l.LookAhead(lex_mode_e.OUTER))

  l = LineLexer(parse_lib._MakeMatcher(), 'func (', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Lit_Chars, 'func'), l.Read(lex_mode_e.OUTER))
  self.assertTokensEqual(
      ast.token(Id.Op_LParen, '('), l.LookAhead(lex_mode_e.OUTER))

def testBashRegexState(self):
  lexer = _InitLexer('(foo|bar)')

  t = lexer.Read(LexMode.BASH_REGEX)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, '('), t)

  t = lexer.Read(LexMode.BASH_REGEX)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'foo'), t)

  t = lexer.Read(LexMode.BASH_REGEX)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, '|'), t)

def testVarOps(self):
  ev = InitEvaluator()  # initializes x=xxx and y=yyy

  unset_sub = ast.BracedVarSub(ast.token(Id.Lit_Chars, 'unset'))
  print(ev.EvalVarSub(unset_sub))

  set_sub = ast.BracedVarSub(ast.token(Id.Lit_Chars, 'x'))
  print(ev.EvalVarSub(set_sub))

  part = ast.LiteralPart(ast.token(Id.Lit_Chars, 'default'))
  arg_word = ast.CompoundWord([part])
  test_op = ast.StringUnary(Id.VTest_ColonHyphen, arg_word)
  unset_sub.suffix_op = test_op
  set_sub.suffix_op = test_op

  print(ev.EvalVarSub(unset_sub))
  print(ev.EvalVarSub(set_sub))

def _Read(self, lex_mode):
  if self.line_lexer.AtEnd():
    line_id, line = self.line_reader.GetLine()

    if line is None:  # no more lines
      t = ast.token(Id.Eof_Real, '', -1)
      # No line number.  I guess we are showing the last line of the file.
      # TODO: Could keep track of previous position for this case?
      return t

    self.line_lexer.Reset(line, line_id)

  t = self.line_lexer.Read(lex_mode)

  # e.g. translate ) or ` into EOF
  if self.translation_stack:
    old_id, new_id = self.translation_stack[-1]  # top
    if t.id == old_id:
      new_s = IdName(new_id)
      #print('==> TRANSLATING %s ==> %s' % (t, new_s))
      self.translation_stack.pop()
      #print(self.translation_stack)
      t.id = new_id

  return t

def _MaybeReadHereDocs(self):
  for h in self.pending_here_docs:
    lines = []
    #log('HERE %r' % h.here_end)
    while True:
      # If op is <<-, strip off all leading tabs (NOT spaces).
      # (in C++, just bump the start?)
      line_id, line = self.line_reader.GetLine()

      #print("LINE %r %r" % (line, h.here_end))
      if not line:  # EOF
        # An unterminated here doc is just a warning in bash.  We make it
        # fatal because we want to be strict, and because it causes problems
        # reporting other errors.  Attribute it to the << in <<EOF for now.
        self.AddErrorContext('Unterminated here doc', span_id=h.spids[0])
        return False

      # NOTE: Could do this at runtime to preserve the LST.
      if h.op_id == Id.Redir_DLessDash:
        line = line.lstrip('\t')
      if line.rstrip() == h.here_end:
        break

      lines.append((line_id, line))

    parts = []
    if h.do_expansion:
      # NOTE: We read all lines at once, instead of doing it line-by-line,
      # because of cases like this:
      # cat <<EOF
      # 1 $(echo 2
      # echo 3) 4
      # EOF
      from osh import parse_lib  # Avoid circular import
      w_parser = parse_lib.MakeWordParserForHereDoc(lines, self.arena)
      word = w_parser.ReadHereDocBody()
      if not word:
        self.AddErrorContext(
            'Error reading here doc body: %s', w_parser.Error())
        return False
      h.body = word
      h.was_filled = True
    else:
      # Each line is a single span.  TODO: Add span_id to token.
      tokens = [
          ast.token(Id.Lit_Chars, line, const.NO_INTEGER)
          for _, line in lines]
      parts = [ast.LiteralPart(t) for t in tokens]
      h.body = ast.CompoundWord(parts)
      h.was_filled = True

  # No .clear() until Python 3.3.
  del self.pending_here_docs[:]

  return True

def _MaybeReadHereDocs(self, node):
  here_docs = _GetHereDocsToFill(node)
  #print('')
  #print('--> FILLING', here_docs)
  #print('')
  for h in here_docs:
    lines = []
    #print(h.here_end)
    while True:
      # If op is <<-, strip off all leading tabs (NOT spaces).
      # (in C++, just bump the start?)
      line_id, line = self.line_reader.GetLine()

      #print("LINE %r %r" % (line, h.here_end))
      if not line:  # EOF
        print('WARNING: unterminated here doc', file=sys.stderr)
        break

      if h.op_id == Id.Redir_DLessDash:
        line = line.lstrip('\t')
      if line.rstrip() == h.here_end:
        break

      lines.append((line_id, line))

    parts = []
    if h.do_expansion:
      # NOTE: We read all lines at once, instead of doing it line-by-line,
      # because of cases like this:
      # cat <<EOF
      # 1 $(echo 2
      # echo 3) 4
      # EOF

      # TODO: Move this import
      from osh import parse_lib
      # TODO: Thread arena.  Need self.arena.
      w_parser = parse_lib.MakeWordParserForHereDoc(lines)
      word = w_parser.ReadHereDocBody()
      if not word:
        self.AddErrorContext(
            'Error reading here doc body: %s', w_parser.Error())
        return False
      h.arg_word = word
      h.was_filled = True
    else:
      # TODO: Add span_id to token
      # Each line is a single span.
      tokens = [ast.token(Id.Lit_Chars, line) for _, line in lines]
      parts = [ast.LiteralPart(t) for t in tokens]
      h.arg_word = ast.CompoundWord(parts)
      h.was_filled = True

  #print('')
  #print('--> FILLED', here_docs)
  #print('')
  return True

def LookAhead(self, lex_mode):
  """Look ahead for a non-space token, using the given lexical state."""
  pos = self.line_pos
  #print('Look ahead from pos %d, line %r' % (pos, self.line))
  while True:
    if pos == len(self.line):
      t = ast.token(Id.Eof_Real, '')  # no location
      return t

    re_list = self.lexer_def[lex_mode]
    end_index, tok_type, tok_val = FindLongestMatch(
        re_list, self.line, pos)
    # NOTE: Instead of hard-coding this token, we could pass it in.  This
    # one only appears in OUTER state!  LookAhead(lex_mode, past_token_type)
    if tok_type != Id.WS_Space:
      break
    pos = end_index

  return ast.token(tok_type, tok_val)  # no location

def testLookAhead(self):
  # I think this is the usage pattern we care about.  Peek and Next() past
  # the function; then Peek() the next token.  Then Lookahead in that state.
  lexer = _InitLexer('func()')

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'func'), t)

  #self.assertEqual(Id.Op_LParen, lexer.LookAhead())

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Op_LParen, '('), t)

  self.assertTokensEqual(
      ast.token(Id.Op_RParen, ')'), lexer.LookAhead(LexMode.OUTER))

  lexer = _InitLexer('func ()')

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.Lit_Chars, 'func'), t)

  t = lexer.Read(LexMode.OUTER)
  self.assertTokensEqual(ast.token(Id.WS_Space, ' '), t)

  self.assertTokensEqual(
      ast.token(Id.Op_LParen, '('), lexer.LookAhead(LexMode.OUTER))

def testReadOuter(self):
  # Lines always end with '\n'
  l = LineLexer(LEXER_DEF, '')
  try:
    l.Read(LexMode.OUTER)
  except AssertionError as e:
    print(e)
  else:
    raise AssertionError('Expected error')

  l = LineLexer(LEXER_DEF, '\n')
  self.assertTokensEqual(
      ast.token(Id.Op_Newline, '\n'), l.Read(LexMode.OUTER))

def _assertReadWordWithArena(test, word_str):
  print('\n---', word_str)
  arena, w_parser = _InitWordParserWithArena(word_str)
  w = w_parser.ReadWord(LexMode.OUTER)
  if w:
    ast.PrettyPrint(w)
  else:
    err = w_parser.Error()
    test.fail("Couldn't parse %r: %s" % (word_str, err))

  # Next word must be \n
  w2 = w_parser.ReadWord(LexMode.OUTER)
  test.assertTrue(
      TokenWordsEqual(ast.TokenWord(ast.token(Id.Op_Newline, '\n')), w2))

  return arena, w

def _assertReadWordWithArena(test, word_str):
  print('\n---', word_str)
  arena, w_parser = _InitWordParserWithArena(word_str)
  w = w_parser.ReadWord(lex_mode_e.OUTER)
  if w:
    ast.PrettyPrint(w)
  else:
    err = w_parser.Error()
    test.fail("Couldn't parse %r: %s" % (word_str, err))

  # Next word must be Eof_Real
  w2 = w_parser.ReadWord(lex_mode_e.OUTER)
  test.assertTrue(
      test_lib.TokenWordsEqual(
          ast.TokenWord(ast.token(Id.Eof_Real, '')), w2),
      w2)

  return arena, w

def _Read(self, lex_mode):
  t = self.line_lexer.Read(lex_mode)
  if t.id == Id.Eol_Tok:  # hit \0
    line_id, line = self.line_reader.GetLine()

    if line is None:  # no more lines
      span_id = self.line_lexer.GetSpanIdForEof()
      t = ast.token(Id.Eof_Real, '', span_id)
      return t

    self.line_lexer.Reset(line, line_id)
    t = self.line_lexer.Read(lex_mode)

  # e.g. translate ) or ` into EOF
  if self.translation_stack:
    old_id, new_id = self.translation_stack[-1]  # top
    if t.id == old_id:
      #print('==> TRANSLATING %s ==> %s' % (t, new_s))
      self.translation_stack.pop()
      t.id = new_id

  return t

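# Hedged illustration of the translation stack consumed by _Read() above: a
# caller pushes an (old_id, new_id) pair before parsing a nested construct,
# so the token that closes it doubles as end-of-input for the sub-parser.
# PushHint and Id.Eof_RParen are assumptions here, not verbatim from this
# section.
#
#   lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
#   # ... parse the command sub until Read() returns a token whose id was
#   # rewritten to Id.Eof_RParen ...
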
def Read(self, lex_mode):
  if self.AtEnd():
    raise AssertionError('EOF')

  re_list = self.lexer_def[lex_mode]

  end_index, tok_type, tok_val = FindLongestMatch(
      re_list, self.line, self.line_pos)
  # NOTE: tok_val is redundant, but even in osh.asdl we have some separation
  # between data needed for formatting and data needed for execution.  Could
  # revisit this later.

  # TODO: Add this back once arena is threaded everywhere
  #assert self.line_id != -1
  line_span = ast.line_span(self.line_id, self.line_pos, len(tok_val))

  # NOTE: We're putting the arena hook in LineLexer and not Lexer because we
  # want it to be "low level".  The only thing fabricated here is a newline
  # added at the last line, so we don't end with \0.
  if self.arena is not None:
    if self.arena_skip:
      assert self.last_span_id != -1
      span_id = self.last_span_id
      self.arena_skip = False
    else:
      span_id = self.arena.AddLineSpan(line_span)
      self.last_span_id = span_id
  else:
    # Completion parser might not have arena?
    # We should probably get rid of this.
    span_id = -1

  t = ast.token(tok_type, tok_val, span_id)
  self.line_pos = end_index
  return t

def testMultiLine(self):
  w_parser = InitWordParser("""\
ls foo

# Multiple newlines and comments should be ignored

ls bar
""")
  print('--MULTI')
  w = w_parser.ReadWord(LexMode.OUTER)
  parts = [ast.LiteralPart(ast.token(Id.Lit_Chars, 'ls'))]
  self.assertEqual(ast.CompoundWord(parts), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  parts = [ast.LiteralPart(ast.token(Id.Lit_Chars, 'foo'))]
  self.assertEqual(ast.CompoundWord(parts), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  t = ast.token(Id.Op_Newline, '\n')
  self.assertEqual(ast.TokenWord(t), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  parts = [ast.LiteralPart(ast.token(Id.Lit_Chars, 'ls'))]
  self.assertEqual(ast.CompoundWord(parts), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  parts = [ast.LiteralPart(ast.token(Id.Lit_Chars, 'bar'))]
  self.assertEqual(ast.CompoundWord(parts), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  t = ast.token(Id.Op_Newline, '\n')
  self.assertEqual(ast.TokenWord(t), w)

  w = w_parser.ReadWord(LexMode.OUTER)
  t = ast.token(Id.Eof_Real, '')
  self.assertEqual(ast.TokenWord(t), w)

def testReadOuter(self):
  l = LineLexer(parse_lib._MakeMatcher(), '\n', self.arena)
  self.assertTokensEqual(
      ast.token(Id.Op_Newline, '\n'), l.Read(lex_mode_e.OUTER))

def testDBracketState(self):
  lexer = _InitLexer('-z foo')
  t = lexer.Read(LexMode.DBRACKET)
  self.assertTokensEqual(ast.token(Id.BoolUnary_z, '-z'), t)
  self.assertEqual(Kind.BoolUnary, LookupKind(t.id))

def TildeDetect(word):
  """Detect tilde expansion.

  If it needs to include a TildeSubPart, return a new word.  Otherwise return
  None.

  NOTE: This algorithm would be simpler if
  1. We could assume some regex for user names.
  2. We didn't need to do brace expansion first, like {~foo,~bar}
     OR
     - If Lit_Slash were special (it is in the VAROP states, but not OUTER
       state).  We could introduce another lexer mode after you hit Lit_Tilde?

  So we have to scan all LiteralPart instances until they contain a '/'.

  http://unix.stackexchange.com/questions/157426/what-is-the-regex-to-validate-linux-users

  "It is usually recommended to only use usernames that begin with a lower
  case letter or an underscore, followed by lower case letters, digits,
  underscores, or dashes.  They can end with a dollar sign.  In regular
  expression terms: [a-z_][a-z0-9_-]*[$]?

  On Debian, the only constraints are that usernames must neither start with
  a dash ('-') nor contain a colon (':') or a whitespace (space: ' ', end of
  line: '\n', tabulation: '\t', etc.).  Note that using a slash ('/') may
  break the default algorithm for the definition of the user's home
  directory."
  """
  if not word.parts:
    return None
  part0 = word.parts[0]
  if _LiteralPartId(part0) != Id.Lit_Tilde:
    return None

  prefix = ''
  found_slash = False
  # Search for the next /
  for i in range(1, len(word.parts)):
    # Not a literal part, and we did NOT find a slash.  So there is no
    # TildeSub applied.  This would be something like ~X$var, ~$var,
    # ~$(echo), etc.  The slash is necessary.
    if word.parts[i].tag != word_part_e.LiteralPart:
      return None

    val = word.parts[i].token.val
    p = val.find('/')

    if p == -1:  # no slash yet
      prefix += val

    elif p >= 0:
      # e.g. for ~foo!bar/baz, extract "bar"
      # NOTE: requires downcast to LiteralPart
      pre, post = val[:p], val[p:]
      prefix += pre
      tilde_part = ast.TildeSubPart(prefix)
      # TODO: Need a span_id here.  Or use a different algorithm.
      #print('SPLITTING %s p = %d' % (word.parts[i], p), file=sys.stderr)
      remainder_part = ast.LiteralPart(ast.token(Id.Lit_Chars, post))
      found_slash = True
      break

  w = ast.CompoundWord()
  if found_slash:
    w.parts.append(tilde_part)
    w.parts.append(remainder_part)

    j = i + 1
    while j < len(word.parts):
      w.parts.append(word.parts[j])
      j += 1
  else:
    # The whole thing is a tilde sub, e.g. ~foo or ~foo!bar
    w.parts.append(ast.TildeSubPart(prefix))

  return w
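
# A minimal usage sketch for TildeDetect (hypothetical; not part of the
# original test suite).  It exercises the slash split described in the
# docstring, using the same ast constructors as the tests above; the
# TildeSubPart.prefix field name is an assumption.
def _DemoTildeDetect():
  # Build the word for ~foo/bar the way the lexer would: a Lit_Tilde token
  # followed by literal chars containing the first '/'.
  w = ast.CompoundWord()
  w.parts.append(ast.LiteralPart(ast.token(Id.Lit_Tilde, '~')))
  w.parts.append(ast.LiteralPart(ast.token(Id.Lit_Chars, 'foo/bar')))

  new_w = TildeDetect(w)
  # Expected: TildeSubPart('foo') followed by the literal '/bar'.
  assert new_w is not None
  assert new_w.parts[0].prefix == 'foo'
  assert new_w.parts[1].token.val == '/bar'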