def createLexers(self):
    """Map file extensions (and a few bare file names) to Pygments lexers."""
    lex = {}
    lex['.c'] = CFamilyLexer()
    lex['.h'] = CFamilyLexer()
    lex['.cpp'] = CppLexer()
    lex['.hpp'] = CppLexer()
    lex['.css'] = CssLexer()
    lex['.sass'] = SassLexer()
    lex['.yaml'] = YamlLexer()
    lex['.yml'] = YamlLexer()
    lex['.json'] = JsonLexer()
    lex['.cs'] = CSharpLexer()
    lex['.fs'] = FSharpLexer()
    lex['.e'] = EiffelLexer()
    lex['.erl'] = ErlangLexer()
    lex['.hrl'] = ErlangLexer()
    lex['.es'] = ErlangLexer()
    lex['.f03'] = FortranLexer()
    lex['.f90'] = FortranLexer()
    lex['.F03'] = FortranLexer()
    lex['.F90'] = FortranLexer()
    lex['.go'] = GoLexer()
    lex['.hs'] = HaskellLexer()
    lex['.v'] = VerilogLexer()
    lex['.vhdl'] = VhdlLexer()
    lex['.vhd'] = VhdlLexer()
    lex['.html'] = HtmlLexer()
    lex['.htm'] = HtmlLexer()
    lex['.xhtml'] = HtmlLexer()
    lex['.xml'] = XmlLexer()
    lex['.js'] = JavascriptLexer()
    lex['.ts'] = TypeScriptLexer()  # was '.tex' (TeX) by mistake; TypeScript files use '.ts'
    lex['.coffee'] = CoffeeScriptLexer()
    lex['.java'] = JavaLexer()
    lex['.scala'] = ScalaLexer()
    lex['.kt'] = KotlinLexer()
    lex['.ktm'] = KotlinLexer()
    lex['.kts'] = KotlinLexer()
    lex['.lisp'] = CommonLispLexer()
    lex['make'] = MakefileLexer()
    lex['Make'] = MakefileLexer()
    lex['CMake'] = CMakeLexer()
    lex['cmake'] = CMakeLexer()
    lex['.m'] = MatlabLexer()
    lex['.mat'] = MatlabLexer()
    lex['.dpr'] = DelphiLexer()
    lex['.perl'] = PerlLexer()
    lex['.php'] = PhpLexer()
    lex['.pr'] = PrologLexer()
    lex['.py'] = Python3Lexer()
    lex['.rb'] = RubyLexer()
    lex['.sh'] = BashLexer()
    lex['.sql'] = MySqlLexer()
    lex['.mysql'] = MySqlLexer()
    lex['.tcl'] = TclLexer()
    lex['.awk'] = AwkLexer()
    return lex
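
# Usage sketch (an assumption, not part of the original class): look up a lexer
# for a path in the table returned by createLexers(), falling back to plain text
# for unknown extensions. `pick_lexer` is an illustrative helper name. Note the
# table also keys bare names like 'make' and 'CMake', so a real caller might
# check os.path.basename() before the extension.
import os
from pygments.lexers.special import TextLexer

def pick_lexer(lexers, path):
    ext = os.path.splitext(path)[1]
    return lexers.get(ext, TextLexer())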
def parse(cls, location):
    with io.open(location, encoding='utf-8') as loc:
        file_contents = loc.read()
    # we use a Pygments formatter for parsing lexed Ruby code
    formatted_file_contents = highlight(
        file_contents, RubyLexer(), ChefMetadataFormatter())
    package_data = json.loads(formatted_file_contents)
    return build_package(package_data, datasource_id=cls.datasource_id)
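
# ChefMetadataFormatter above is project-specific, but the underlying idea — a
# Pygments Formatter that emits JSON instead of markup — can be sketched. This
# is a minimal illustration; `KeyValueJsonFormatter` and its pairing heuristic
# are assumptions, not the real formatter's logic.
import json
from pygments.formatter import Formatter
from pygments.token import Name, String

class KeyValueJsonFormatter(Formatter):
    """Collect `name 'value'` pairs from a Ruby token stream and dump them as JSON."""
    def format(self, tokensource, outfile):
        data = {}
        key = None
        for ttype, value in tokensource:
            if ttype in Name:
                key = value
            elif ttype in String and key and value.strip("'\""):
                data[key] = value.strip("'\"")
                key = None
        json.dump(data, outfile)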
def POST(self):
    data = web.input()
    code = data.code
    language = data.lang
    if language == 'python':
        from pygments.lexers.python import PythonLexer
        lexer = PythonLexer()
    elif language == 'php':
        from pygments.lexers.php import PhpLexer
        lexer = PhpLexer()
    elif language == 'java':
        from pygments.lexers.jvm import JavaLexer
        lexer = JavaLexer()
    elif language == 'javascript':
        from pygments.lexers.javascript import JavascriptLexer
        lexer = JavascriptLexer()
    elif language == 'html':
        from pygments.lexers.html import HtmlLexer
        lexer = HtmlLexer()
    elif language == 'cpp':
        from pygments.lexers.c_cpp import CppLexer
        lexer = CppLexer()
    elif language == 'shell':
        from pygments.lexers.shell import ShellSessionLexer
        lexer = ShellSessionLexer()
    elif language == 'matlab':
        from pygments.lexers.matlab import MatlabLexer
        lexer = MatlabLexer()
    elif language == 'ruby':
        from pygments.lexers.ruby import RubyLexer
        lexer = RubyLexer()
    elif language == 'r':
        from pygments.lexers.r import RConsoleLexer
        lexer = RConsoleLexer()
    elif language == 'lisp':
        from pygments.lexers.lisp import SchemeLexer
        lexer = SchemeLexer()
    elif language == 'go':
        from pygments.lexers.go import GoLexer
        lexer = GoLexer()
    else:
        # fall back to plain text so `lexer` is always bound; the original
        # chain raised NameError for any unlisted language
        from pygments.lexers.special import TextLexer
        lexer = TextLexer()
    formatter = html.HtmlFormatter(linenos=False, encoding='utf-8', nowrap=False)
    highlighted_snippet = highlight(code, lexer, formatter)
    return render.result(highlighted_snippet)
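
# The import-per-branch chain above can be collapsed with Pygments' own
# name-based lookup. A minimal sketch, assuming the same inputs; the helper
# name `highlight_snippet` is illustrative.
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name
from pygments.util import ClassNotFound

def highlight_snippet(code, language):
    try:
        lexer = get_lexer_by_name(language)
    except ClassNotFound:
        lexer = get_lexer_by_name('text')  # plain-text fallback
    return highlight(code, lexer, HtmlFormatter(linenos=False, nowrap=False))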
    'xslt': XmlLexer(),
    'xsd': XmlLexer(),
    'wsdl': XmlLexer(),
    'php': HtmlPhpLexer(),
    'php5': HtmlPhpLexer(),
    'pl': Perl6Lexer(),
    'pm': Perl6Lexer(),
    'nqp': Perl6Lexer(),
    'p6': Perl6Lexer(),
    '6pl': Perl6Lexer(),
    'p6l': Perl6Lexer(),
    'pl6': Perl6Lexer(),
    'p6m': Perl6Lexer(),
    'pm6': Perl6Lexer(),
    't': Perl6Lexer(),
    'rb': RubyLexer(),
    'rbw': RubyLexer(),
    'rake': RubyLexer(),
    'rbx': RubyLexer(),
    'duby': RubyLexer(),
    'gemspec': RubyLexer(),
    'ini': IniLexer(),
    'init': IniLexer(),
    'sh': BashLexer(),
    'diff': DiffLexer(),
    'patch': DiffLexer(),
    'cs': CSharpLexer(),
    'md': MarkdownLexer(),  # WAIT: works poorly
}

known_extensions = list(lexer_from_ext.keys())
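
# A sketch of how a mapping like lexer_from_ext might be consumed, using
# Pygments' filename-based guessing as a fallback for unknown extensions.
# `lexer_for_file` is an illustrative name, not part of the original.
from pygments.lexers import guess_lexer_for_filename
from pygments.lexers.special import TextLexer
from pygments.util import ClassNotFound

def lexer_for_file(filename, source):
    ext = filename.rsplit('.', 1)[-1].lower()
    if ext in lexer_from_ext:
        return lexer_from_ext[ext]
    try:
        return guess_lexer_for_filename(filename, source)
    except ClassNotFound:
        return TextLexer()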
def lexer():
    yield RubyLexer()
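
# The generator above has the shape of a pytest fixture. A minimal sketch,
# assuming it is registered with @pytest.fixture, of a test that consumes it;
# the test name and the sample input are illustrative, not original code.
from pygments.token import Token

def test_lexes_def_keyword(lexer):
    tokens = list(lexer.get_tokens("def foo; end"))
    assert tokens[0] == (Token.Keyword, "def")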
pass

if extens in ("py", "pyw", "sc", "sage", "tac"):
    ui_core = CoreUI(lexer=PythonLexer())
elif extens in ("txt", "README", "text"):
    ui_core = CoreUI(lexer=TextLexer())
elif extens in ("htm", "html", "css", "js", "md"):
    ui_core = CoreUI(lexer=HtmlLexer())
elif extens in ("xml", "xsl", "rss", "xslt", "xsd", "wsdl", "wsf"):
    ui_core = CoreUI(lexer=XmlLexer())
elif extens in ("php", "php5"):
    ui_core = CoreUI(lexer=HtmlPhpLexer())
elif extens in ("pl", "pm", "nqp", "p6", "6pl", "p6l", "pl6", "p6m", "pm6", "t"):
    ui_core = CoreUI(lexer=Perl6Lexer())
elif extens in ("rb", "rbw", "rake", "rbx", "duby", "gemspec"):
    ui_core = CoreUI(lexer=RubyLexer())
elif extens in ("ini", "init"):
    ui_core = CoreUI(lexer=IniLexer())
elif extens in ("conf", "cnf", "config"):
    ui_core = CoreUI(lexer=ApacheConfLexer())
elif extens in ("sh", "cmd", "bashrc", "bash_profile"):
    ui_core = CoreUI(lexer=BashLexer())
elif extens in ("diff", "patch"):
    ui_core = CoreUI(lexer=DiffLexer())
elif extens == "cs":
    ui_core = CoreUI(lexer=CSharpLexer())
elif extens == "sql":
    ui_core = CoreUI(lexer=MySqlLexer())
else:
    ui_core = CoreUI(lexer=PythonLexer())  # default (no extension) lexer is Python
ui_core.mainloop()
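
# The elif chain can also be written as a lookup table, which is easier to
# extend. A minimal sketch assuming the same CoreUI class and extension
# groups; `EXTENSION_LEXERS` and `lexer_for_extension` are illustrative names.
EXTENSION_LEXERS = {
    ("py", "pyw", "sc", "sage", "tac"): PythonLexer,
    ("txt", "README", "text"): TextLexer,
    ("htm", "html", "css", "js", "md"): HtmlLexer,
    ("diff", "patch"): DiffLexer,
    # ... remaining groups elided
}

def lexer_for_extension(extens):
    for exts, lexer_cls in EXTENSION_LEXERS.items():
        if extens in exts:
            return lexer_cls()
    return PythonLexer()  # same default as the chain above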
    # We want exactly one trailing space, so strip the whitespace that SPACE
    # and NEWLINE tokens introduced, then append a single space.
    return token.rstrip() + " "


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", dest="N", required=True, type=int)
    # was misspelled "--threshould" in the original
    parser.add_argument("--threshold", dest="threshold", required=True, type=float)
    parser.add_argument("-t", "--target", required=True,
                        type=argparse.FileType("r"), dest="target_file")
    parser.add_argument("-v", "--vector", required=True, type=str, dest="vector_path")
    args = parser.parse_args()

    model = kenlm.Model(args.vector_path)
    lexer = RubyLexer()
    token_stream = lexer.get_tokens(args.target_file.read())
    token_str = ""
    for token_data in token_stream:
        token_str += replace_special_char(token_data[-1])
    token_list = token_str.split(" ")
    bag_of_ngrams = ngrams(token_list, args.N)
    index = 0
    x = []
    y = []
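
    # A hedged sketch of how the scoring loop that follows these declarations
    # might proceed: score each n-gram with the kenlm model and collect
    # (index, score) pairs in the x/y lists. This loop body is an assumption,
    # not the original code.
    for ngram in bag_of_ngrams:
        score = model.score(" ".join(ngram), bos=False, eos=False)
        x.append(index)
        y.append(score)
        index += 1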
def tokenize(program_path, raw=False):
    lexer = RubyLexer()
    token_streams = []
    with open(program_path, "r") as f:
        program = f.readlines()
    num_of_lines = len(program)
    last_indent_count = 0
    for line in program:
        line_of_token = []
        for token_data in lexer.get_tokens(line):
            token_type = token_data[0]
            token = token_data[-1]
            if raw:
                if is_token_subtype(token_type, Comment) or is_token_subtype(token_type, Literal):
                    arranged_token = replace_special_char(token, comment=True)
                else:
                    arranged_token = replace_special_char(token, comment=False)
            else:
                if is_token_subtype(token_type, Literal):
                    arranged_token = "<LITERAL>"
                elif is_token_subtype(token_type, String):
                    arranged_token = "<STRING>"
                elif is_token_subtype(token_type, Number):
                    arranged_token = "<NUMBER>"
                elif token_type == Token.Name.Operator:
                    arranged_token = "<OPERATOR>"
                elif token_type == Name and token not in reserved:
                    arranged_token = "<ID>"
                elif token_type == Name.Variable.Instance:
                    arranged_token = "<INSTANCE_VAL>"
                elif token_type == Name.Variable.Class:
                    arranged_token = "<CLASS_VAL>"
                elif token_type == Name.Constant:
                    arranged_token = "<CONSTANT_ID>"
                elif token_type == Name.Function:
                    arranged_token = "<FUNCTION>"
                elif token_type == Name.Class:
                    arranged_token = "<CLASS>"
                elif token_type == Name.Namespace:
                    arranged_token = "<NAMESPACE>"
                elif token_type == Token.Name.Variable.Global:
                    arranged_token = "<GLOBAL_VAL>"
                elif token_type == Token.Error:
                    # token Pygments emits when lexing fails (emoji, etc.)
                    arranged_token = "<ERROR>"
                elif is_token_subtype(token_type, Comment):
                    arranged_token = "<COMMENT>"
                else:
                    arranged_token = replace_special_char(token)
            # if arranged_token not in reserved and "SPACE" not in arranged_token and "NEWLINE" not in arranged_token:
            #     if token_type not in (Token.Punctuation, Token.Operator, Token.Name.Builtin, Token.Keyword.Pseudo):
            #         print("==============")
            #         print(program_path)
            #         print(line.rstrip())
            #         print("{} : {}".format(arranged_token.encode("utf-8"), token_type))
            #         print("==============")
            # append a trailing space so the stream stays space-separated
            line_of_token.append(arranged_token + " ")
        # treat two leading spaces as one indent level
        line_of_token[0] = line_of_token[0].replace("<SPACE> <SPACE> ", "<INDENT> ")
        # encode indentation relative to the previous line's indent level
        indent_count = len(re.findall("<INDENT>", line_of_token[0]))
        if indent_count != 0:
            # an indented blank line can leave both the indent and the newline
            # in element 0, so strip the indent markers before prepending the
            # relative indent tag
            indent_char = "<INDENT{}> ".format(indent_count - last_indent_count)
            line_of_token[0] = line_of_token[0].replace("<INDENT> ", "")
            line_of_token[0] = indent_char + line_of_token[0]
        if len(line_of_token) != 1:
            last_indent_count = indent_count
        token_streams.append(line_of_token)
    return token_streams, num_of_lines
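
# Usage sketch: tokenize a Ruby file and print each abstracted line. The file
# path is illustrative; tokenize() returns one token list per source line plus
# the line count.
if __name__ == "__main__":
    streams, n_lines = tokenize("example.rb")
    print("{} lines".format(n_lines))
    for line_tokens in streams:
        print("".join(line_tokens))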