def tokenize(self, text):
    """Split *text* into a list of tokens.

    Tokens are maximal runs of whitespace or of non-whitespace
    characters; a ``#`` additionally starts a comment token that
    consumes the rest of the line.  The concatenation of all returned
    tokens equals *text*, and the result always contains at least one
    (possibly empty) string.
    """
    whitespace = re.compile(r"\s")  # raw string; compiled once, not per char
    i = 0        # current position in text
    t = 0        # index of the token currently being built
    tokens = [""]
    inString = 0  # armed (1) only for the char right after a quote
    while i < len(text):
        ch = text[i]
        # A quote glues onto the current token and arms inString.
        # NOTE(review): inString is cleared on the very next non-quote
        # character, so quoted strings are NOT kept intact across
        # spaces or '#' — presumably deliberate for this highlighter;
        # confirm before "fixing".
        if inString == 0 and (ch == '"' or ch == "'"):
            inString = 1
            tokens[t] += ch
            i += 1
            continue
        elif inString == 1 and ch != '"' and ch != "'":
            inString = 0
        # '#' starts a comment token swallowing the rest of the line.
        if ch == "#":
            if len(tokens[t]) == 0:
                tokens[t] += text[i:]   # current token is empty: reuse it
            else:
                tokens.append(text[i:])
            break
        space = whitespace.match(ch)
        # Current token counts as "space-like" when empty too, so the
        # very first character always extends it.
        curTokenSpace = whitespace.match(tokens[t]) or len(tokens[t]) == 0
        if space:
            if curTokenSpace:
                tokens[t] += ch       # extend the whitespace run
            else:
                tokens.append(ch)     # non-space run ended: new token
                t += 1
        elif curTokenSpace:
            if len(tokens[t]) == 0:
                tokens[t] += ch       # first char of the line
            else:
                tokens.append(ch)     # whitespace run ended: new token
                t += 1
        else:
            tokens[t] += ch           # extend the non-space run
        i += 1
    return tokens
def annotateLine(self, lineNumber, text):
    """Tokenize *text* and wrap each token in an AnnotatedFragment.

    Parameters:
        lineNumber: index of the line (unused here; kept for interface
            compatibility with callers).
        text: raw line content.

    Returns a list of AnnotatedFragment objects; each fragment's class
    string may contain "keyword" and/or "comment" (space-separated), or
    be empty.
    """
    # Early exit: a line that is only a comment (after optional leading
    # whitespace) becomes a single "comment" fragment.
    if re.match(r"^\s*#", text):
        return [AnnotatedFragment(text, "comment")]
    # Other lines have to be tokenized first.
    fragments = []
    for token in self.tokenize(text):
        classes = []
        # 'in' replaces dict.has_key(), which was removed in Python 3.
        if token in self.keywordMap:
            classes.append("keyword")
        if token.startswith("#"):
            classes.append("comment")
        fragments.append(AnnotatedFragment(token, " ".join(classes)))
    return fragments
def isSpace(str):
    """Return a truthy re.Match when the first character of *str* is
    whitespace, otherwise None (the empty string also yields None).

    NOTE(review): the parameter name shadows the builtin ``str``; it is
    kept unchanged for keyword-call compatibility.
    """
    return re.match(r"\s", str)