def execute(self, context):
    language = context.scene.hl_language
    lexer = find_lexer_class(language)
    code = context.active_object.data.body
    text = highlight(code, lexer(), RawTokenFormatter()).decode()
    process(text, context.active_object)
    return {'FINISHED'}
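The operator resolves the lexer from a language name stored on the scene. For reference, a minimal sketch of Pygments' `find_lexer_class`, which looks a lexer class up by its human-readable name (assumes only a stock Pygments install):

# Minimal sketch of the lexer lookup used above (standard Pygments API).
from pygments.lexers import find_lexer_class

lexer_cls = find_lexer_class('Python')  # lookup by human-readable name
print(lexer_cls)  # e.g. <class 'pygments.lexers.python.PythonLexer'>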
def generate_syntax_objects(code):
    print_time_stamp()
    make_materials()
    add_fonts()
    seq_yielder = get_unique_sequential_name()

    # process data
    code_as_raw = highlight(code, Python3Lexer(), RawTokenFormatter())
    pre_split_lines = code_as_raw.decode('utf-8')
    # the separator contains a literal tab between the token name and its value
    post_split_lines = pre_split_lines.split("Token.Text\t'\\n'")

    # write to objects
    write_lines(post_split_lines, seq_yielder)
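A minimal sketch, assuming only a stock Pygments install, of the raw-token format the split above relies on: RawTokenFormatter emits one tab-separated `<token type>	<repr(value)>` line per token, so each newline in the source shows up as a `Token.Text	'\n'` line (exact token types can vary between Pygments versions):

from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import Python3Lexer

raw = highlight("x = 1\n", Python3Lexer(), RawTokenFormatter()).decode('utf-8')
print(raw)
# Tab-separated, one token per line, roughly:
#   Token.Name                      'x'
#   Token.Text                      ' '
#   Token.Operator                  '='
#   Token.Text                      ' '
#   Token.Literal.Number.Integer    '1'
#   Token.Text                      '\n'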
from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import RawTokenLexer


def test_invalid_raw_token():
    # These should not throw exceptions.
    assert (highlight("Tolkien", RawTokenLexer(), RawTokenFormatter())
            == b"Token.Error\t'Tolkien\\n'\n")
    assert (highlight("Tolkien\t'x'", RawTokenLexer(), RawTokenFormatter())
            == b"Token\t'x'\n")
    assert (highlight("Token.Text\t42", RawTokenLexer(), RawTokenFormatter())
            == b"Token.Error\t'Token.Text\\t42\\n'\n")
    assert (highlight("Token.Text\t'", RawTokenLexer(), RawTokenFormatter())
            == b'Token.Error\t"Token.Text\\t\'\\n"\n')
    assert (highlight("Token.Text\t'α'", RawTokenLexer(), RawTokenFormatter())
            == b"Token.Text\t'\\u03b1'\n")
    assert (highlight("Token.Text\tu'α'", RawTokenLexer(), RawTokenFormatter())
            == b"Token.Text\t'\\u03b1'\n")
    assert (highlight(b"Token.Text\t'\xff'", RawTokenLexer(), RawTokenFormatter())
            == b"Token.Text\t'\\xff'\n")
import bz2
import gzip

from pygments import highlight
from pygments.formatters import HtmlFormatter, RawTokenFormatter
from pygments.lexers import PythonLexer, RawTokenLexer


def test_raw_token():
    code = "2 + α"
    raw = highlight(code, PythonLexer(), RawTokenFormatter())
    html = highlight(code, PythonLexer(), HtmlFormatter())

    assert highlight(raw, RawTokenLexer(), RawTokenFormatter()) == raw
    assert highlight(raw, RawTokenLexer(), HtmlFormatter()) == html
    assert highlight(raw.decode(), RawTokenLexer(), HtmlFormatter()) == html

    raw_gz = highlight(code, PythonLexer(), RawTokenFormatter(compress="gz"))
    assert gzip.decompress(raw_gz) == raw
    assert highlight(raw_gz, RawTokenLexer(compress="gz"),
                     RawTokenFormatter()) == raw
    assert (highlight(raw_gz.decode("latin1"), RawTokenLexer(compress="gz"),
                      RawTokenFormatter()) == raw)

    raw_bz2 = highlight(code, PythonLexer(), RawTokenFormatter(compress="bz2"))
    assert bz2.decompress(raw_bz2) == raw
    assert highlight(raw_bz2, RawTokenLexer(compress="bz2"),
                     RawTokenFormatter()) == raw
    assert (highlight(raw_bz2.decode("latin1"), RawTokenLexer(compress="bz2"),
                      RawTokenFormatter()) == raw)
import json
import os

from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import JavaLexer


def extract_class_and_method(java_dir, base_path):
    """
    Extract the class, function, attribute, and plain names from every
    .java file. Results are keyed by each file's hash (its base name)
    and saved to a JSON file.
    """
    if not os.path.isdir(java_dir):
        return None
    files = os.listdir(java_dir)
    names_dict = {}
    for f in files:
        h, i = os.path.splitext(f)
        if i == '.java':
            classnames = set()
            methodnames = set()
            attributenames = set()
            names = set()
            with open(os.path.join(java_dir, f)) as fl:
                cont = fl.read()
                x = highlight(cont, JavaLexer(), RawTokenFormatter())
                for y in str(x, encoding='utf-8').splitlines():
                    ys = y.split('\t')
                    if ys[0] == 'Token.Name.Class':
                        classnames.add(eval(ys[1]))
                    elif ys[0] == 'Token.Name.Function':
                        methodnames.add(eval(ys[1]))
                    elif ys[0] == 'Token.Name.Attribute':
                        attributenames.add(eval(ys[1]))
                    elif ys[0] == 'Token.Name':
                        names.add(eval(ys[1]))
            names_dict[h] = {
                'NC': list(classnames),
                'NF': list(methodnames),
                'NA': list(attributenames),
                'N': list(names)
            }
    repo = java_dir.strip('/').split('/')[-1]
    jf = os.path.join(base_path, '{}.names.json'.format(repo))
    with open(jf, 'w') as out:
        json.dump(names_dict, out)
    return names_dict
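Each value field in the raw output is a repr'd Python string literal, which is why the code above calls `eval` on it. A hedged sketch with a made-up line: `ast.literal_eval` decodes the same literals without executing arbitrary code.

# Safer decoding of a raw-token value; the sample line is illustrative.
import ast

line = "Token.Name.Class\t'HelloWorld'"
token_type, value_repr = line.split('\t')
print(token_type, ast.literal_eval(value_repr))  # Token.Name.Class HelloWorld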
if os.path.isabs(args.source_file):
    source_file = args.source_file
else:
    source_file = os.path.realpath(
        os.path.join(os.getcwd(), args.source_file))

if not args.verbose:
    args.verbose = 0
setup_logger(args.verbose)

log.debug('[PARSING] {}'.format(source_file))
with open(source_file, 'r') as f:
    file_content = f.read()

# `Pygments` lexing.
lexed_content = highlight(file_content, CoqLexer(), RawTokenFormatter())

# Load the entire file contents into RAM as a string.
# IMPROVEMENT: enhance the `RawTokenFormatter` class to stream this data.
parsed_content = str(lexed_content, encoding='utf-8').splitlines()
del lexed_content

# Regular expression matching a raw token line.
regex = re.compile(r"Token\.((?:\w+\.?)+)\s['|\"](.*)['|\"]\n?")

# Detect errors in the file syntax.
parsing_errors = check_raw_token_syntax(parsed_content, regex)

# If a line does not match the regex, log an error and exit the application.
if len(parsing_errors):
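A hedged sketch of what the regex above captures on one raw-token line; the sample line is illustrative rather than taken from a real Coq file:

import re

regex = re.compile(r"Token\.((?:\w+\.?)+)\s['|\"](.*)['|\"]\n?")
match = regex.match("Token.Keyword\t'Lemma'")
if match:
    print(match.group(1))  # the token path after "Token.", e.g. Keyword
    print(match.group(2))  # the token's text, e.g. Lemma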
            syntax_width = create_syntax_block(caret, syntax_object)
            caret.x += syntax_width
        caret.x = 0.0
        print('----newline')
        caret.y -= line_height


# ----------------- main loop
print_time_stamp()
pymat = bpy.data.materials
make_materials(material_library)
add_fonts()
seq_yielder = get_unique_sequential_name()

# ----------------- make raw data
code = bpy.context.edit_text.as_string()
code_as_raw = highlight(code, Python3Lexer(), RawTokenFormatter())

# ----------------- process data
pre_split_lines = code_as_raw.decode('utf-8')
# the separator contains a literal tab between the token name and its value
post_split_lines = pre_split_lines.split("Token.Text\t'\\n'")

# ----------------- write data
write_lines(post_split_lines)
from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import JavaLexer


def pygment_mul_line(java_lines):
    string = '\n'.join(java_lines)
    if string == '':
        return list(), dict()
    x = highlight(string, JavaLexer(), RawTokenFormatter())
    x = str(x, encoding='utf-8')
    tokenList = list()
    variableDict = dict()
    nameNum, attNum, clsNum, fucNum = 0, 0, 0, 0
    otherDict = dict()
    floatNum, numberNum, strNum = 0, 0, 0
    for y in x.splitlines():
        ys = y.split('\t')
        # print(ys)
        s = eval(ys[1])
        if s == '\n':
            tokenList.append('<nl>')
        elif s == 'NewBlock':
            tokenList.append('<nb>')
        elif s.isspace():
            lines = s.count('\n')
            for _ in range(lines):
                tokenList.append('<nl>')
        elif "Token.Literal.Number.Float" == ys[0]:
            if s not in otherDict:
                sT = 'FLOAT{}'.format(floatNum)
                otherDict[s] = sT
                floatNum += 1
            tokenList.append(otherDict[s])
        elif ys[0].startswith('Token.Literal.Number'):
            if s not in otherDict:
                sT = 'NUMBER{}'.format(numberNum)
                otherDict[s] = sT
                numberNum += 1
            tokenList.append(otherDict[s])
        elif ys[0].startswith('Token.Literal.String'):
            if s not in otherDict:
                sT = 'STRING{}'.format(strNum)
                otherDict[s] = sT
                strNum += 1
            tokenList.append(otherDict[s])
        elif "Token.Name.Namespace" == ys[0]:
            tokenList.append('NAMESPACE')
        elif "Token.Comment.Single" == ys[0]:
            tokenList.append('SINGLE')
            tokenList.append('<nl>')
        elif "Token.Comment.Multiline" == ys[0]:
            lines = s.count('\n')
            for _ in range(lines):
                tokenList.append('COMMENT')
                tokenList.append('<nl>')
            tokenList.append('COMMENT')
        elif 'Token.Name.Decorator' == ys[0]:
            tokenList.append('@')
            tokenList.append(s[1:].lower())
        elif 'Token.Name' == ys[0]:
            if s not in variableDict:
                sT = 'n{}'.format(nameNum)
                variableDict[s] = sT
                nameNum += 1
            tokenList.append(s)
        elif 'Token.Name.Attribute' == ys[0]:
            if s not in variableDict:
                sT = 'a{}'.format(attNum)
                variableDict[s] = sT
                attNum += 1
            tokenList.append(s)
        elif 'Token.Name.Class' == ys[0]:
            if s not in variableDict:
                sT = 'c{}'.format(clsNum)
                variableDict[s] = sT
                clsNum += 1
            tokenList.append(s)
        elif 'Token.Name.Function' == ys[0]:
            if s not in variableDict:
                sT = 'f{}'.format(fucNum)
                variableDict[s] = sT
                fucNum += 1
            tokenList.append(s)
        else:
            a = s.splitlines()
            for i in a:
                if i != '' and not i.isspace():
                    tokenList.append(i)
                tokenList.append('<nl>')
            tokenList.pop()
    return tokenList, variableDict
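A hypothetical call with made-up Java lines, showing the two return values: a normalized token stream with `<nl>` line markers, and the renaming map built for names, attributes, classes, and functions:

# Illustrative input only; real source lines would be fed in the same way.
tokens, variables = pygment_mul_line([
    'public class Foo {',
    '    int bar = 42;',
    '}',
])
print(tokens)     # e.g. [..., 'Foo', '{', '<nl>', ..., 'NUMBER0', ...]
print(variables)  # e.g. {'Foo': 'c0', 'bar': 'n0'}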
from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import JavaLexer


def pygment_one_line(linestring):
    l = list()
    namelist = list()
    attributelist = list()
    classlist = list()
    functionlist = list()
    if len(linestring) < 1 or linestring.startswith(
            '+++') or linestring.startswith('---'):
        return l, namelist, attributelist, classlist, functionlist
    st = linestring[0]
    # print(st)
    linestring = linestring[1:].strip()
    if linestring == '':
        return l, namelist, attributelist, classlist, functionlist
    if st == '@':
        l.append((2, '<NewBlock>'))
        linestring = linestring[linestring.find('@@') + 3:].strip()
        if linestring == '':
            return l, namelist, attributelist, classlist, functionlist
        cls = 2
    elif st == ' ':
        cls = 2
    elif st == '-':
        cls = 1
    elif st == '+':
        cls = 3
    else:
        return l, namelist, attributelist, classlist, functionlist
    if linestring.startswith('/*') or linestring.startswith(
            '*') or linestring.endswith('*/'):
        l.append((cls, 'JAVADOC'))
        return l, namelist, attributelist, classlist, functionlist
    x = highlight(linestring, JavaLexer(), RawTokenFormatter())
    x = str(x, encoding='utf-8')
    for y in x.splitlines():
        ys = y.split('\t')
        print(ys)
        s = eval(ys[1]).strip(' \t\n\r')
        if s != '':
            # print(ys)
            if "Token.Literal.Number.Float" == ys[0]:
                l.append((cls, 'FLOAT'))
            elif "Token.Literal.Number.Integer" == ys[0]:
                l.append((cls, 'INTEGER'))
            elif "Token.Literal.Number.Hex" == ys[0]:
                l.append((cls, 'HEX'))
            elif "Token.Literal.String" == ys[0]:
                l.append((cls, 'STRING'))
            elif "Token.Literal.String.Char" == ys[0]:
                l.append((cls, 'CHAR'))
            elif "Token.Name.Namespace" == ys[0]:
                l.append((cls, 'NAMESPACE'))
            elif "Token.Comment.Single" == ys[0]:
                l.append((cls, 'SINGLE'))
            elif "Token.Comment.Multiline" == ys[0]:
                l.append((cls, 'MULTILINE'))
            elif 'Token.Name.Decorator' == ys[0]:
                l.append((cls, 'DECORATOR'))
            elif 'Token.Name' == ys[0]:
                namelist.append(s)
                l.append((cls, s))
            elif 'Token.Name.Attribute' == ys[0]:
                attributelist.append(s)
                l.append((cls, s))
            elif 'Token.Name.Class' == ys[0]:
                classlist.append(s)
                l.append((cls, s))
            elif 'Token.Name.Function' == ys[0]:
                functionlist.append(s)
                l.append((cls, s))
            else:
                l.append((cls, s))
    # print(l)
    return l, namelist, attributelist, classlist, functionlist
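A hypothetical call: the function expects a single unified-diff line, whose first character ('+', '-', ' ', or '@') selects the change class (3, 1, 2, and 2 respectively):

# Illustrative diff line; any unified-diff line works the same way.
l, names, attrs, classes, funcs = pygment_one_line('+int bar = 42;')
print(l)      # e.g. [(3, 'int'), (3, 'bar'), (3, '='), (3, 'INTEGER'), (3, ';')]
print(names)  # names tagged Token.Name, e.g. ['bar']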
code = """
def function Foo
for i in range(5):
    print("hello world!")
end function

while i < 10 begin
    inc i
    print(i)
end

def function Bar
for i in range(5):
    print("hello world!")
end function

goto 10
"""

print(highlight(code, FooLangLexer(), TerminalFormatter()))
input()

tokens = highlight(code, FooLangLexer(), RawTokenFormatter())
tokens = tokens.decode()
for token in tokens.split("\n"):
    foobar = token.split("\t")
    if len(foobar) == 2:
        print("{token:30} {value}".format(token=foobar[0], value=foobar[1]))
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import RawTokenFormatter

code = """
for i in range(1, 11):
    print("Hello world!")
"""

tokens = highlight(code, PythonLexer(), RawTokenFormatter())
tokens = tokens.decode()
for token in tokens.split("\n"):
    foobar = token.split("\t")
    if len(foobar) == 2:
        print("{token:30} {value}".format(token=foobar[0], value=foobar[1]))
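Run as-is, the loop prints one token per line; the output looks roughly like this (token names and whitespace handling can differ across Pygments versions):

# Token.Text                     '\n'
# Token.Keyword                  'for'
# Token.Text                     ' '
# Token.Name                     'i'
# ...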
text.pack()  # place the text widget
code = u'print "hello" # comment'  # a line of Python code
text.insert("end", code)  # insert the text into the text widget

# configure the text widget's tags
text.tag_configure("Token.Keyword", foreground='blue',
                   font=('arial', 10, 'bold'))
text.tag_configure("Token.Text", foreground='black',
                   font=('arial', 10, 'normal'))
text.tag_configure("Token.Literal.String", foreground='red',
                   font=('arial', 10, 'normal'))
text.tag_configure("Token.Comment", foreground='darkgreen',
                   font=('arial', 10, 'normal'))

code = text.get("1.0", "end-1c")  # get the text from the text widget
text.delete("1.0", "end")  # delete all text from the text widget

# first approach:
from pygments import highlight  # returns formatted text
# for each line of text formatted with PythonLexer() and RawTokenFormatter()
for line in highlight(code, PythonLexer(), RawTokenFormatter()).decode().split("\n"):
    pair = line.split("\t")  # split the line on the tab character
    if pair != ['']:  # if the pair is not empty
        (token, s) = pair
        print(token, eval(s))  # print to the console
        text.insert("end", eval(s), token)  # insert tagged text into the widget

# second approach:
# from pygments import lex  # lexical analysis, returns an iterator of tokens
# for token, content in lex(code, PythonLexer()):
#     print(token, content)
#     text.insert("end", content, str(token))

root.mainloop()  # the application's main loop